diff --git a/speech/cloud-client/README.md b/speech/cloud-client/README.md
index 207c280b891..9a7055f32ff 100644
--- a/speech/cloud-client/README.md
+++ b/speech/cloud-client/README.md
@@ -103,3 +103,49 @@ Transcribe an audio file with recognition metadata
 ```
 mvn exec:java -DRecognize -Dexec.args="metadata ./resources/commercial_mono.wav"
 ```
+
+
+## Diarization
+Transcribe an audio file using speaker diarization
+```
+mvn exec:java -DRecognize -Dexec.args="diarization ./resources/commercial_mono.wav"
+```
+
+Transcribe an audio file hosted on GCS using speaker diarization
+```
+mvn exec:java -DRecognize -Dexec.args="diarization gs://cloud-samples-tests/speech/commercial_mono.wav"
+```
+
+## Multi-channel
+Transcribe an audio file with multiple channels
+```
+mvn exec:java -DRecognize -Dexec.args="multi-channel ./resources/commercial_stereo.wav"
+```
+
+Transcribe an audio file hosted on GCS with multiple channels
+```
+mvn exec:java -DRecognize -Dexec.args="multi-channel gs://cloud-samples-tests/speech/commercial_stereo.wav"
+```
+
+## Multi-language
+Transcribe an audio file with multiple languages
+```
+mvn exec:java -DRecognize -Dexec.args="multi-language ./resources/Google_Gnome.wav"
+```
+
+Transcribe an audio file hosted on GCS with multiple languages
+```
+mvn exec:java -DRecognize -Dexec.args="multi-language gs://cloud-samples-tests/speech/Google_Gnome.wav"
+```
+
+## Word-level confidence
+Transcribe an audio file with word-level confidence
+```
+mvn exec:java -DRecognize -Dexec.args="word-level-conf ./resources/audio.raw"
+```
+
+Transcribe an audio file hosted on GCS with word-level confidence
+```
+mvn exec:java -DRecognize -Dexec.args="word-level-conf gs://cloud-samples-tests/speech/brooklyn.flac"
+```
diff --git a/speech/cloud-client/pom.xml b/speech/cloud-client/pom.xml
index ac7e3e65412..e43fe5422e6 100644
--- a/speech/cloud-client/pom.xml
+++ b/speech/cloud-client/pom.xml
@@ -40,7 +40,7 @@
     <dependency>
       <groupId>com.google.cloud</groupId>
       <artifactId>google-cloud-speech</artifactId>
-      <version>0.52.0-alpha</version>
+      <version>0.55.0-beta</version>
     </dependency>
diff --git a/speech/cloud-client/resources/commercial_stereo.wav b/speech/cloud-client/resources/commercial_stereo.wav
new file mode 100644
index 00000000000..467f3687702
Binary files /dev/null and b/speech/cloud-client/resources/commercial_stereo.wav differ
diff --git a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java
index 396cc5110ea..9771ad2a8e9 100644
--- a/speech/cloud-client/src/main/java/com/example/speech/Recognize.java
+++ b/speech/cloud-client/src/main/java/com/example/speech/Recognize.java
@@ -39,27 +39,28 @@
 import com.google.cloud.speech.v1p1beta1.WordInfo;
 import com.google.common.util.concurrent.SettableFuture;
 import com.google.protobuf.ByteString;
+
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.util.ArrayList;
 import java.util.List;
 
 public class Recognize {
 
-  /**
-   * Run speech recognition tasks.
-   */
+  /** Run speech recognition tasks. */
   public static void main(String... args) throws Exception {
     if (args.length < 1) {
       System.out.println("Usage:");
       System.out.printf(
           "\tjava %s \"<command>\" \"<path-to-file>\"\n"
-              + "Commands:\n"
-              + "\tsyncrecognize | asyncrecognize | streamrecognize | wordoffsets | model-selection\n"
-              + "\t| auto-punctuation | stream-punctuation | enhanced-model | metadata\n"
-              + "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI "
-              + "for a Cloud Storage resource (gs://...)\n",
+              + "Commands:\n"
+              + "\tsyncrecognize | asyncrecognize | streamrecognize | wordoffsets\n"
+              + "\t| model-selection | auto-punctuation | stream-punctuation | enhanced-model\n"
+              + "\t| metadata | diarization | multi-channel | multi-language | word-level-conf\n"
+              + "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI "
+              + "for a Cloud Storage resource (gs://...)\n",
           Recognize.class.getCanonicalName());
       return;
     }
@@ -105,6 +106,30 @@ public static void main(String... args) throws Exception {
       transcribeFileWithEnhancedModel(path);
     } else if (command.equals("metadata")) {
       transcribeFileWithMetadata(path);
+    } else if (command.equals("diarization")) {
+      if (path.startsWith("gs://")) {
+        transcribeDiarizationGcs(path);
+      } else {
+        transcribeDiarization(path);
+      }
+    } else if (command.equals("multi-channel")) {
+      if (path.startsWith("gs://")) {
+        transcribeMultiChannelGcs(path);
+      } else {
+        transcribeMultiChannel(path);
+      }
+    } else if (command.equals("multi-language")) {
+      if (path.startsWith("gs://")) {
+        transcribeMultiLanguageGcs(path);
+      } else {
+        transcribeMultiLanguage(path);
+      }
+    } else if (command.equals("word-level-conf")) {
+      if (path.startsWith("gs://")) {
+        transcribeWordLevelConfidenceGcs(path);
+      } else {
+        transcribeWordLevelConfidence(path);
+      }
     }
   }
@@ -120,14 +145,13 @@ public static void syncRecognizeFile(String fileName) throws Exception {
     ByteString audioBytes = ByteString.copyFrom(data);
 
     // Configure request with local raw PCM audio
-    RecognitionConfig config = RecognitionConfig.newBuilder()
-        .setEncoding(AudioEncoding.LINEAR16)
-        .setLanguageCode("en-US")
-        .setSampleRateHertz(16000)
-        .build();
-    RecognitionAudio audio = RecognitionAudio.newBuilder()
-        .setContent(audioBytes)
-        .build();
+    RecognitionConfig config =
+        RecognitionConfig.newBuilder()
+            .setEncoding(AudioEncoding.LINEAR16)
+            .setLanguageCode("en-US")
+            .setSampleRateHertz(16000)
+            .build();
+    RecognitionAudio audio = RecognitionAudio.newBuilder().setContent(audioBytes).build();
 
     // Use blocking call to get audio transcript
     RecognizeResponse response = speech.recognize(config, audio);
@@ -154,15 +178,14 @@ public static void syncRecognizeWords(String fileName) throws Exception {
     ByteString audioBytes = ByteString.copyFrom(data);
 
     // Configure request with local raw PCM audio
-    RecognitionConfig config = RecognitionConfig.newBuilder()
-        .setEncoding(AudioEncoding.LINEAR16)
-        .setLanguageCode("en-US")
-        .setSampleRateHertz(16000)
-        .setEnableWordTimeOffsets(true)
-        .build();
-    RecognitionAudio audio = RecognitionAudio.newBuilder()
-        .setContent(audioBytes)
-        .build();
+    RecognitionConfig config =
+        RecognitionConfig.newBuilder()
+            .setEncoding(AudioEncoding.LINEAR16)
+            .setLanguageCode("en-US")
+            .setSampleRateHertz(16000)
+            .setEnableWordTimeOffsets(true)
+            .build();
+    RecognitionAudio audio = RecognitionAudio.newBuilder().setContent(audioBytes).build();
 
     // Use blocking call to get audio transcript
     RecognizeResponse response = speech.recognize(config, audio);
@@ -175,7 +198,8 @@ public static void syncRecognizeWords(String fileName) throws Exception {
%s%n", alternative.getTranscript()); for (WordInfo wordInfo : alternative.getWordsList()) { System.out.println(wordInfo.getWord()); - System.out.printf("\t%s.%s sec - %s.%s sec\n", + System.out.printf( + "\t%s.%s sec - %s.%s sec\n", wordInfo.getStartTime().getSeconds(), wordInfo.getStartTime().getNanos() / 100000000, wordInfo.getEndTime().getSeconds(), @@ -194,14 +218,13 @@ public static void syncRecognizeGcs(String gcsUri) throws Exception { // Instantiates a client with GOOGLE_APPLICATION_CREDENTIALS try (SpeechClient speech = SpeechClient.create()) { // Builds the request for remote FLAC file - RecognitionConfig config = RecognitionConfig.newBuilder() - .setEncoding(AudioEncoding.FLAC) - .setLanguageCode("en-US") - .setSampleRateHertz(16000) - .build(); - RecognitionAudio audio = RecognitionAudio.newBuilder() - .setUri(gcsUri) - .build(); + RecognitionConfig config = + RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.FLAC) + .setLanguageCode("en-US") + .setSampleRateHertz(16000) + .build(); + RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build(); // Use blocking call for getting audio transcript RecognizeResponse response = speech.recognize(config, audio); @@ -216,10 +239,9 @@ public static void syncRecognizeGcs(String gcsUri) throws Exception { } } - /** - * Performs non-blocking speech recognition on raw PCM audio and prints - * the transcription. Note that transcription is limited to 60 seconds audio. + * Performs non-blocking speech recognition on raw PCM audio and prints the transcription. Note + * that transcription is limited to 60 seconds audio. * * @param fileName the path to a PCM audio file to transcribe. */ @@ -232,14 +254,13 @@ public static void asyncRecognizeFile(String fileName) throws Exception { ByteString audioBytes = ByteString.copyFrom(data); // Configure request with local raw PCM audio - RecognitionConfig config = RecognitionConfig.newBuilder() - .setEncoding(AudioEncoding.LINEAR16) - .setLanguageCode("en-US") - .setSampleRateHertz(16000) - .build(); - RecognitionAudio audio = RecognitionAudio.newBuilder() - .setContent(audioBytes) - .build(); + RecognitionConfig config = + RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.LINEAR16) + .setLanguageCode("en-US") + .setSampleRateHertz(16000) + .build(); + RecognitionAudio audio = RecognitionAudio.newBuilder().setContent(audioBytes).build(); // Use non-blocking call for getting file transcription OperationFuture response = @@ -262,8 +283,8 @@ public static void asyncRecognizeFile(String fileName) throws Exception { } /** - * Performs non-blocking speech recognition on remote FLAC file and prints - * the transcription as well as word time offsets. + * Performs non-blocking speech recognition on remote FLAC file and prints the transcription as + * well as word time offsets. * * @param gcsUri the path to the remote LINEAR16 audio file to transcribe. 
*/ @@ -272,15 +293,14 @@ public static void asyncRecognizeWords(String gcsUri) throws Exception { try (SpeechClient speech = SpeechClient.create()) { // Configure remote file request for Linear16 - RecognitionConfig config = RecognitionConfig.newBuilder() - .setEncoding(AudioEncoding.FLAC) - .setLanguageCode("en-US") - .setSampleRateHertz(16000) - .setEnableWordTimeOffsets(true) - .build(); - RecognitionAudio audio = RecognitionAudio.newBuilder() - .setUri(gcsUri) - .build(); + RecognitionConfig config = + RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.FLAC) + .setLanguageCode("en-US") + .setSampleRateHertz(16000) + .setEnableWordTimeOffsets(true) + .build(); + RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build(); // Use non-blocking call for getting file transcription OperationFuture response = @@ -299,7 +319,8 @@ public static void asyncRecognizeWords(String gcsUri) throws Exception { System.out.printf("Transcription: %s\n", alternative.getTranscript()); for (WordInfo wordInfo : alternative.getWordsList()) { System.out.println(wordInfo.getWord()); - System.out.printf("\t%s.%s sec - %s.%s sec\n", + System.out.printf( + "\t%s.%s sec - %s.%s sec\n", wordInfo.getStartTime().getSeconds(), wordInfo.getStartTime().getNanos() / 100000000, wordInfo.getEndTime().getSeconds(), @@ -310,8 +331,7 @@ public static void asyncRecognizeWords(String gcsUri) throws Exception { } /** - * Performs non-blocking speech recognition on remote FLAC file and prints - * the transcription. + * Performs non-blocking speech recognition on remote FLAC file and prints the transcription. * * @param gcsUri the path to the remote LINEAR16 audio file to transcribe. */ @@ -320,14 +340,13 @@ public static void asyncRecognizeGcs(String gcsUri) throws Exception { try (SpeechClient speech = SpeechClient.create()) { // Configure remote file request for Linear16 - RecognitionConfig config = RecognitionConfig.newBuilder() - .setEncoding(AudioEncoding.FLAC) - .setLanguageCode("en-US") - .setSampleRateHertz(16000) - .build(); - RecognitionAudio audio = RecognitionAudio.newBuilder() - .setUri(gcsUri) - .build(); + RecognitionConfig config = + RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.FLAC) + .setLanguageCode("en-US") + .setSampleRateHertz(16000) + .build(); + RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build(); // Use non-blocking call for getting file transcription OperationFuture response = @@ -348,7 +367,6 @@ public static void asyncRecognizeGcs(String gcsUri) throws Exception { } } - /** * Performs streaming speech recognition on raw PCM audio data. 
   *
@@ -362,15 +380,15 @@ public static void streamingRecognizeFile(String fileName) throws Exception, IOE
 
     try (SpeechClient speech = SpeechClient.create()) {
 
       // Configure request with local raw PCM audio
-      RecognitionConfig recConfig = RecognitionConfig.newBuilder()
-          .setEncoding(AudioEncoding.LINEAR16)
-          .setLanguageCode("en-US")
-          .setSampleRateHertz(16000)
-          .setModel("default")
-          .build();
-      StreamingRecognitionConfig config = StreamingRecognitionConfig.newBuilder()
-          .setConfig(recConfig)
-          .build();
+      RecognitionConfig recConfig =
+          RecognitionConfig.newBuilder()
+              .setEncoding(AudioEncoding.LINEAR16)
+              .setLanguageCode("en-US")
+              .setSampleRateHertz(16000)
+              .setModel("default")
+              .build();
+      StreamingRecognitionConfig config =
+          StreamingRecognitionConfig.newBuilder().setConfig(recConfig).build();
 
       class ResponseApiStreamingObserver<T> implements ApiStreamObserver<T> {
         private final SettableFuture<List<T>> future = SettableFuture.create();
@@ -407,14 +425,14 @@ public SettableFuture<List<T>> future() {
           callable.bidiStreamingCall(responseObserver);
 
       // The first request must **only** contain the audio configuration:
-      requestObserver.onNext(StreamingRecognizeRequest.newBuilder()
-          .setStreamingConfig(config)
-          .build());
+      requestObserver.onNext(
+          StreamingRecognizeRequest.newBuilder().setStreamingConfig(config).build());
 
       // Subsequent requests must **only** contain the audio data.
-      requestObserver.onNext(StreamingRecognizeRequest.newBuilder()
-          .setAudioContent(ByteString.copyFrom(data))
-          .build());
+      requestObserver.onNext(
+          StreamingRecognizeRequest.newBuilder()
+              .setAudioContent(ByteString.copyFrom(data))
+              .build());
 
       // Mark transmission as completed after sending the data.
       requestObserver.onCompleted();
@@ -436,8 +454,8 @@ public SettableFuture<List<T>> future() {
 
   // [START speech_transcribe_model_selection]
   /**
-   * Performs transcription of the given audio file synchronously with
-   * the selected model.
+   * Performs transcription of the given audio file synchronously with the selected model.
+   *
    * @param fileName the path to a audio file to transcribe
    */
   public static void transcribeModelSelection(String fileName) throws Exception {
@@ -446,19 +464,19 @@ public static void transcribeModelSelection(String fileName) throws Exception {
 
     try (SpeechClient speech = SpeechClient.create()) {
       // Configure request with video media type
-      RecognitionConfig recConfig = RecognitionConfig.newBuilder()
-          // encoding may either be omitted or must match the value in the file header
-          .setEncoding(AudioEncoding.LINEAR16)
-          .setLanguageCode("en-US")
-          // sample rate hertz may be either be omitted or must match the value in the file header
-          .setSampleRateHertz(16000)
-          .setModel("video")
-          .build();
-
-      RecognitionAudio recognitionAudio = RecognitionAudio.newBuilder()
-          .setContent(ByteString.copyFrom(content))
-          .build();
-
+      RecognitionConfig recConfig =
+          RecognitionConfig.newBuilder()
+              // encoding may either be omitted or must match the value in the file header
+              .setEncoding(AudioEncoding.LINEAR16)
+              .setLanguageCode("en-US")
+              // sample rate hertz may either be omitted or must match the value in the file
+              // header
+              .setSampleRateHertz(16000)
+              .setModel("video")
+              .build();
+
+      RecognitionAudio recognitionAudio =
+          RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();
       RecognizeResponse recognizeResponse = speech.recognize(recConfig, recognitionAudio);
 
       // Just print the first result here.
@@ -473,26 +491,26 @@ public static void transcribeModelSelection(String fileName) throws Exception {
 
   // [START speech_transcribe_model_selection_gcs]
   /**
-   * Performs transcription of the remote audio file asynchronously with
-   * the selected model.
+   * Performs transcription of the remote audio file asynchronously with the selected model.
+   *
   * @param gcsUri the path to the remote audio file to transcribe.
   */
   public static void transcribeModelSelectionGcs(String gcsUri) throws Exception {
     try (SpeechClient speech = SpeechClient.create()) {
 
       // Configure request with video media type
-      RecognitionConfig config = RecognitionConfig.newBuilder()
-          // encoding may either be omitted or must match the value in the file header
-          .setEncoding(AudioEncoding.LINEAR16)
-          .setLanguageCode("en-US")
-          // sample rate hertz may be either be omitted or must match the value in the file header
-          .setSampleRateHertz(16000)
-          .setModel("video")
-          .build();
-
-      RecognitionAudio audio = RecognitionAudio.newBuilder()
-          .setUri(gcsUri)
-          .build();
+      RecognitionConfig config =
+          RecognitionConfig.newBuilder()
+              // encoding may either be omitted or must match the value in the file header
+              .setEncoding(AudioEncoding.LINEAR16)
+              .setLanguageCode("en-US")
+              // sample rate hertz may either be omitted or must match the value in the file
+              // header
+              .setSampleRateHertz(16000)
+              .setModel("video")
+              .build();
+
+      RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();
 
       // Use non-blocking call for getting file transcription
       OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response =
@@ -527,17 +545,17 @@ public static void transcribeFileWithAutomaticPunctuation(String fileName) throw
 
     try (SpeechClient speechClient = SpeechClient.create()) {
       // Configure request with local raw PCM audio
-      RecognitionConfig recConfig = RecognitionConfig.newBuilder()
-          .setEncoding(AudioEncoding.LINEAR16)
-          .setLanguageCode("en-US")
-          .setSampleRateHertz(16000)
-          .setEnableAutomaticPunctuation(true)
-          .build();
+      RecognitionConfig recConfig =
+          RecognitionConfig.newBuilder()
+              .setEncoding(AudioEncoding.LINEAR16)
+              .setLanguageCode("en-US")
+              .setSampleRateHertz(16000)
+              .setEnableAutomaticPunctuation(true)
+              .build();
 
       // Get the contents of the local audio file
-      RecognitionAudio recognitionAudio = RecognitionAudio.newBuilder()
-          .setContent(ByteString.copyFrom(content))
-          .build();
+      RecognitionAudio recognitionAudio =
+          RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();
 
       // Perform the transcription request
       RecognizeResponse recognizeResponse = speechClient.recognize(recConfig, recognitionAudio);
@@ -564,17 +582,16 @@ public static void transcribeFileWithAutomaticPunctuation(String fileName) throw
   public static void transcribeGcsWithAutomaticPunctuation(String gcsUri) throws Exception {
     try (SpeechClient speechClient = SpeechClient.create()) {
       // Configure request with raw PCM audio
-      RecognitionConfig config = RecognitionConfig.newBuilder()
-          .setEncoding(AudioEncoding.FLAC)
-          .setLanguageCode("en-US")
-          .setSampleRateHertz(16000)
-          .setEnableAutomaticPunctuation(true)
-          .build();
+      RecognitionConfig config =
+          RecognitionConfig.newBuilder()
+              .setEncoding(AudioEncoding.FLAC)
+              .setLanguageCode("en-US")
+              .setSampleRateHertz(16000)
+              .setEnableAutomaticPunctuation(true)
+              .build();
 
       // Set the remote path for the audio file
-      RecognitionAudio audio = RecognitionAudio.newBuilder()
-          .setUri(gcsUri)
-          .build();
+      RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();
 
       // Use non-blocking call for getting file transcription
       OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response =
@@ -612,17 +629,17 @@ public static void streamingTranscribeWithAutomaticPunctuation(String fileName)
 
     try (SpeechClient speech = SpeechClient.create()) {
       // Configure request with local raw PCM audio
-      RecognitionConfig recConfig = RecognitionConfig.newBuilder()
-          .setEncoding(AudioEncoding.LINEAR16)
-          .setLanguageCode("en-US")
-          .setSampleRateHertz(16000)
-          .setEnableAutomaticPunctuation(true)
-          .build();
+      RecognitionConfig recConfig =
+          RecognitionConfig.newBuilder()
+              .setEncoding(AudioEncoding.LINEAR16)
+              .setLanguageCode("en-US")
+              .setSampleRateHertz(16000)
+              .setEnableAutomaticPunctuation(true)
+              .build();
 
       // Build the streaming config with the audio config
-      StreamingRecognitionConfig config = StreamingRecognitionConfig.newBuilder()
-          .setConfig(recConfig)
-          .build();
+      StreamingRecognitionConfig config =
+          StreamingRecognitionConfig.newBuilder().setConfig(recConfig).build();
 
       class ResponseApiStreamingObserver<T> implements ApiStreamObserver<T> {
         private final SettableFuture<List<T>> future = SettableFuture.create();
@@ -659,14 +676,14 @@ public SettableFuture<List<T>> future() {
           callable.bidiStreamingCall(responseObserver);
 
       // The first request must **only** contain the audio configuration:
-      requestObserver.onNext(StreamingRecognizeRequest.newBuilder()
-          .setStreamingConfig(config)
-          .build());
+      requestObserver.onNext(
+          StreamingRecognizeRequest.newBuilder().setStreamingConfig(config).build());
 
       // Subsequent requests must **only** contain the audio data.
-      requestObserver.onNext(StreamingRecognizeRequest.newBuilder()
-          .setAudioContent(ByteString.copyFrom(data))
-          .build());
+      requestObserver.onNext(
+          StreamingRecognizeRequest.newBuilder()
+              .setAudioContent(ByteString.copyFrom(data))
+              .build());
 
       // Mark transmission as completed after sending the data.
       requestObserver.onCompleted();
@@ -699,21 +716,21 @@ public static void transcribeFileWithEnhancedModel(String fileName) throws Excep
 
     try (SpeechClient speechClient = SpeechClient.create()) {
       // Get the contents of the local audio file
-      RecognitionAudio recognitionAudio = RecognitionAudio.newBuilder()
-          .setContent(ByteString.copyFrom(content))
-          .build();
+      RecognitionAudio recognitionAudio =
+          RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();
 
       // Configure request to enable enhanced models
-      RecognitionConfig config = RecognitionConfig.newBuilder()
-          .setEncoding(AudioEncoding.LINEAR16)
-          .setLanguageCode("en-US")
-          .setSampleRateHertz(8000)
-          // Enhanced models are only available to projects that
-          // opt in for audio data collection.
-          .setUseEnhanced(true)
-          // A model must be specified to use enhanced model.
-          .setModel("phone_call")
-          .build();
+      RecognitionConfig config =
+          RecognitionConfig.newBuilder()
+              .setEncoding(AudioEncoding.LINEAR16)
+              .setLanguageCode("en-US")
+              .setSampleRateHertz(8000)
+              // Enhanced models are only available to projects that
+              // opt in for audio data collection.
+              .setUseEnhanced(true)
+              // A model must be specified to use an enhanced model.
+              .setModel("phone_call")
+              .build();
 
       // Perform the transcription request
       RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
@@ -741,30 +758,31 @@ public static void transcribeFileWithMetadata(String fileName) throws Exception
 
     try (SpeechClient speechClient = SpeechClient.create()) {
       // Get the contents of the local audio file
-      RecognitionAudio recognitionAudio = RecognitionAudio.newBuilder()
-          .setContent(ByteString.copyFrom(content))
-          .build();
+      RecognitionAudio recognitionAudio =
+          RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();
 
       // Construct a recognition metadata object.
       // Most metadata fields are specified as enums that can be found
       // in speech.enums.RecognitionMetadata
-      RecognitionMetadata metadata = RecognitionMetadata.newBuilder()
-          .setInteractionType(InteractionType.DISCUSSION)
-          .setMicrophoneDistance(MicrophoneDistance.NEARFIELD)
-          .setRecordingDeviceType(RecordingDeviceType.SMARTPHONE)
-          .setRecordingDeviceName("Pixel 2 XL") // Some metadata fields are free form strings
-          // And some are integers, for instance the 6 digit NAICS code
-          // https://www.naics.com/search/
-          .setIndustryNaicsCodeOfAudio(519190)
-          .build();
+      RecognitionMetadata metadata =
+          RecognitionMetadata.newBuilder()
+              .setInteractionType(InteractionType.DISCUSSION)
+              .setMicrophoneDistance(MicrophoneDistance.NEARFIELD)
+              .setRecordingDeviceType(RecordingDeviceType.SMARTPHONE)
+              .setRecordingDeviceName("Pixel 2 XL") // Some metadata fields are free-form strings
+              // and some are integers, for instance the 6-digit NAICS code
+              // https://www.naics.com/search/
+              .setIndustryNaicsCodeOfAudio(519190)
+              .build();
 
       // Configure request to enable enhanced models
-      RecognitionConfig config = RecognitionConfig.newBuilder()
-          .setEncoding(AudioEncoding.LINEAR16)
-          .setLanguageCode("en-US")
-          .setSampleRateHertz(8000)
-          .setMetadata(metadata) // Add the metadata to the config
-          .build();
+      RecognitionConfig config =
+          RecognitionConfig.newBuilder()
+              .setEncoding(AudioEncoding.LINEAR16)
+              .setLanguageCode("en-US")
+              .setSampleRateHertz(8000)
+              .setMetadata(metadata) // Add the metadata to the config
+              .build();
 
       // Perform the transcription request
       RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
@@ -779,4 +797,355 @@ public static void transcribeFileWithMetadata(String fileName) throws Exception
     }
   }
   // [END speech_transcribe_file_with_metadata]
+
+  // [START speech_transcribe_diarization]
+  /**
+   * Transcribe the given audio file using speaker diarization.
+   *
+   * @param fileName the path to an audio file.
+   */
+  public static void transcribeDiarization(String fileName) throws Exception {
+    Path path = Paths.get(fileName);
+    byte[] content = Files.readAllBytes(path);
+
+    try (SpeechClient speechClient = SpeechClient.create()) {
+      // Get the contents of the local audio file
+      RecognitionAudio recognitionAudio =
+          RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();
+
+      // Configure request to enable speaker diarization
+      RecognitionConfig config =
+          RecognitionConfig.newBuilder()
+              .setEncoding(AudioEncoding.LINEAR16)
+              .setLanguageCode("en-US")
+              .setSampleRateHertz(8000)
+              .setEnableSpeakerDiarization(true)
+              .setDiarizationSpeakerCount(2)
+              .build();
+
+      // Perform the transcription request
+      RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
+
+      // Print out the results
+      for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
+        // There can be several alternative transcripts for a given chunk of speech. Just
+        // use the first (most likely) one here.
+        SpeechRecognitionAlternative alternative = result.getAlternatives(0);
+        System.out.format("Transcript : %s\n", alternative.getTranscript());
+        // The words array contains the entire transcript up until that point.
+        // Referencing the last spoken word to get the associated speaker tag
+        System.out.format(
+            "Speaker Tag %s: %s\n",
+            alternative.getWords(alternative.getWordsCount() - 1).getSpeakerTag(),
+            alternative.getTranscript());
+      }
+    }
+  }
+  // [END speech_transcribe_diarization]
+
+  // [START speech_transcribe_diarization_gcs]
+  /**
+   * Transcribe a remote audio file using speaker diarization.
+   *
+   * @param gcsUri the path to an audio file.
+   */
+  public static void transcribeDiarizationGcs(String gcsUri) throws Exception {
+    try (SpeechClient speechClient = SpeechClient.create()) {
+      // Configure request to enable speaker diarization
+      RecognitionConfig config =
+          RecognitionConfig.newBuilder()
+              .setEncoding(AudioEncoding.LINEAR16)
+              .setLanguageCode("en-US")
+              .setSampleRateHertz(8000)
+              .setEnableSpeakerDiarization(true)
+              .setDiarizationSpeakerCount(2)
+              .build();
+
+      // Set the remote path for the audio file
+      RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();
+
+      // Use non-blocking call for getting file transcription
+      OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response =
+          speechClient.longRunningRecognizeAsync(config, audio);
+
+      while (!response.isDone()) {
+        System.out.println("Waiting for response...");
+        Thread.sleep(10000);
+      }
+
+      for (SpeechRecognitionResult result : response.get().getResultsList()) {
+        // There can be several alternative transcripts for a given chunk of speech. Just
+        // use the first (most likely) one here.
+        SpeechRecognitionAlternative alternative = result.getAlternatives(0);
+        // The words array contains the entire transcript up until that point.
+        // Referencing the last spoken word to get the associated speaker tag
+        System.out.format(
+            "Speaker Tag %s: %s\n",
+            alternative.getWords(alternative.getWordsCount() - 1).getSpeakerTag(),
+            alternative.getTranscript());
+      }
+    }
+  }
+
+  // [END speech_transcribe_diarization_gcs]
+
+  // [START speech_transcribe_multichannel]
+
+  /**
+   * Transcribe a local audio file with multi-channel recognition.
+   *
+   * @param fileName the path to local audio file
+   */
+  public static void transcribeMultiChannel(String fileName) throws Exception {
+    Path path = Paths.get(fileName);
+    byte[] content = Files.readAllBytes(path);
+
+    try (SpeechClient speechClient = SpeechClient.create()) {
+      // Get the contents of the local audio file
+      RecognitionAudio recognitionAudio =
+          RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();
+
+      // Configure request to enable multiple channels
+      RecognitionConfig config =
+          RecognitionConfig.newBuilder()
+              .setEncoding(AudioEncoding.LINEAR16)
+              .setLanguageCode("en-US")
+              .setSampleRateHertz(44100)
+              .setAudioChannelCount(2)
+              .setEnableSeparateRecognitionPerChannel(true)
+              .build();
+
+      // Perform the transcription request
+      RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
+
+      // Print out the results
+      for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
+        // There can be several alternative transcripts for a given chunk of speech. Just use the
+        // first (most likely) one here.
+        SpeechRecognitionAlternative alternative = result.getAlternatives(0);
+        System.out.format("Transcript : %s\n", alternative.getTranscript());
+        System.out.printf("Channel Tag : %s\n\n", result.getChannelTag());
+      }
+    }
+  }
+  // [END speech_transcribe_multichannel]
+
+  // [START speech_transcribe_multichannel_gcs]
+
+  /**
+   * Transcribe a remote audio file with multi-channel recognition.
+   *
+   * @param gcsUri the path to the audio file
+   */
+  public static void transcribeMultiChannelGcs(String gcsUri) throws Exception {
+
+    try (SpeechClient speechClient = SpeechClient.create()) {
+
+      // Configure request to enable multiple channels
+      RecognitionConfig config =
+          RecognitionConfig.newBuilder()
+              .setEncoding(AudioEncoding.LINEAR16)
+              .setLanguageCode("en-US")
+              .setSampleRateHertz(44100)
+              .setAudioChannelCount(2)
+              .setEnableSeparateRecognitionPerChannel(true)
+              .build();
+
+      // Set the remote path for the audio file
+      RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();
+
+      // Use non-blocking call for getting file transcription
+      OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response =
+          speechClient.longRunningRecognizeAsync(config, audio);
+
+      while (!response.isDone()) {
+        System.out.println("Waiting for response...");
+        Thread.sleep(10000);
+      }
+      // Print out the results
+      for (SpeechRecognitionResult result : response.get().getResultsList()) {
+
+        // There can be several alternative transcripts for a given chunk of speech. Just use the
+        // first (most likely) one here.
+        SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
+
+        // Print out the result
+        System.out.printf("Transcript : %s\n", alternative.getTranscript());
+        System.out.printf("Channel Tag : %s\n\n", result.getChannelTag());
+      }
+    }
+  }
+  // [END speech_transcribe_multichannel_gcs]
+
+  // [START speech_transcribe_multilang]
+
+  /**
+   * Transcribe a local audio file with multi-language recognition.
+   *
+   * @param fileName the path to the audio file
+   */
+  public static void transcribeMultiLanguage(String fileName) throws Exception {
+    Path path = Paths.get(fileName);
+    // Get the contents of the local audio file
+    byte[] content = Files.readAllBytes(path);
+
+    try (SpeechClient speechClient = SpeechClient.create()) {
+
+      RecognitionAudio recognitionAudio =
+          RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();
+      ArrayList<String> languageList = new ArrayList<>();
+      languageList.add("es-ES");
+      languageList.add("en-US");
+
+      // Configure request to enable multiple languages
+      RecognitionConfig config =
+          RecognitionConfig.newBuilder()
+              .setEncoding(AudioEncoding.LINEAR16)
+              .setSampleRateHertz(16000)
+              .setLanguageCode("ja-JP")
+              .addAllAlternativeLanguageCodes(languageList)
+              .build();
+      // Perform the transcription request
+      RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
+
+      // Print out the results
+      for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
+        // There can be several alternative transcripts for a given chunk of speech. Just use the
+        // first (most likely) one here.
+        SpeechRecognitionAlternative alternative = result.getAlternatives(0);
+        System.out.format("Transcript : %s\n\n", alternative.getTranscript());
+      }
+    }
+  }
+  // [END speech_transcribe_multilang]
+
+  // [START speech_transcribe_multilang_gcs]
+
+  /**
+   * Transcribe a remote audio file with multi-language recognition.
+   *
+   * @param gcsUri the path to the remote audio file
+   */
+  public static void transcribeMultiLanguageGcs(String gcsUri) throws Exception {
+    try (SpeechClient speechClient = SpeechClient.create()) {
+
+      ArrayList<String> languageList = new ArrayList<>();
+      languageList.add("es-ES");
+      languageList.add("en-US");
+
+      // Configure request to enable multiple languages
+      RecognitionConfig config =
+          RecognitionConfig.newBuilder()
+              .setEncoding(AudioEncoding.LINEAR16)
+              .setSampleRateHertz(16000)
+              .setLanguageCode("ja-JP")
+              .addAllAlternativeLanguageCodes(languageList)
+              .build();
+
+      // Set the remote path for the audio file
+      RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();
+
+      // Use non-blocking call for getting file transcription
+      OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response =
+          speechClient.longRunningRecognizeAsync(config, audio);
+
+      while (!response.isDone()) {
+        System.out.println("Waiting for response...");
+        Thread.sleep(10000);
+      }
+
+      for (SpeechRecognitionResult result : response.get().getResultsList()) {
+
+        // There can be several alternative transcripts for a given chunk of speech. Just use the
+        // first (most likely) one here.
+        SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
+
+        // Print out the result
+        System.out.printf("Transcript : %s\n\n", alternative.getTranscript());
+      }
+    }
+  }
+  // [END speech_transcribe_multilang_gcs]
+
+  // [START speech_transcribe_word_level_confidence]
+
+  /**
+   * Transcribe a local audio file with word-level confidence.
+   *
+   * @param fileName the path to the local audio file
+   */
+  public static void transcribeWordLevelConfidence(String fileName) throws Exception {
+    Path path = Paths.get(fileName);
+    byte[] content = Files.readAllBytes(path);
+
+    try (SpeechClient speechClient = SpeechClient.create()) {
+      RecognitionAudio recognitionAudio =
+          RecognitionAudio.newBuilder().setContent(ByteString.copyFrom(content)).build();
+      // Configure request to enable word level confidence
+      RecognitionConfig config =
+          RecognitionConfig.newBuilder()
+              .setEncoding(AudioEncoding.LINEAR16)
+              .setSampleRateHertz(16000)
+              .setLanguageCode("en-US")
+              .setEnableWordConfidence(true)
+              .build();
+      // Perform the transcription request
+      RecognizeResponse recognizeResponse = speechClient.recognize(config, recognitionAudio);
+
+      // Print out the results
+      for (SpeechRecognitionResult result : recognizeResponse.getResultsList()) {
+        // There can be several alternative transcripts for a given chunk of speech. Just use the
+        // first (most likely) one here.
+        SpeechRecognitionAlternative alternative = result.getAlternatives(0);
+        System.out.format("Transcript : %s\n", alternative.getTranscript());
+        System.out.format(
+            "First Word and Confidence : %s %s \n",
+            alternative.getWords(0).getWord(), alternative.getWords(0).getConfidence());
+      }
+    }
+  }
+  // [END speech_transcribe_word_level_confidence]
+
+  // [START speech_transcribe_word_level_confidence_gcs]
+
+  /**
+   * Transcribe a remote audio file with word-level confidence.
+   *
+   * @param gcsUri path to the remote audio file
+   */
+  public static void transcribeWordLevelConfidenceGcs(String gcsUri) throws Exception {
+    try (SpeechClient speechClient = SpeechClient.create()) {
+
+      // Configure request to enable word level confidence
+      RecognitionConfig config =
+          RecognitionConfig.newBuilder()
+              .setEncoding(AudioEncoding.FLAC)
+              .setSampleRateHertz(16000)
+              .setLanguageCode("en-US")
+              .setEnableWordConfidence(true)
+              .build();
+
+      // Set the remote path for the audio file
+      RecognitionAudio audio = RecognitionAudio.newBuilder().setUri(gcsUri).build();
+
+      // Use non-blocking call for getting file transcription
+      OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response =
+          speechClient.longRunningRecognizeAsync(config, audio);
+
+      while (!response.isDone()) {
+        System.out.println("Waiting for response...");
+        Thread.sleep(10000);
+      }
+      // Just print the first result here.
+      SpeechRecognitionResult result = response.get().getResultsList().get(0);
+
+      // There can be several alternative transcripts for a given chunk of speech. Just use the
+      // first (most likely) one here.
+      SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
+      // Print out the result
+      System.out.printf("Transcript : %s\n", alternative.getTranscript());
+      System.out.format(
+          "First Word and Confidence : %s %s \n",
+          alternative.getWords(0).getWord(), alternative.getWords(0).getConfidence());
+    }
+  }
+  // [END speech_transcribe_word_level_confidence_gcs]
 }
diff --git a/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java b/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java
index 983d3a8724f..2a36ac3922a 100644
--- a/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java
+++ b/speech/cloud-client/src/test/java/com/example/speech/RecognizeIT.java
@@ -26,21 +26,21 @@
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;
 
-/**
- * Tests for speech recognize sample.
- */
+/** Tests for speech recognize sample. */
 @RunWith(JUnit4.class)
 @SuppressWarnings("checkstyle:abbreviationaswordinname")
 public class RecognizeIT {
-  private static final String PROJECT_ID = System.getenv("GOOGLE_CLOUD_PROJECT");
-  private static final String BUCKET = PROJECT_ID;
+  private static final String BUCKET = "cloud-samples-tests";
 
   private ByteArrayOutputStream bout;
   private PrintStream out;
 
   // The path to the audio file to transcribe
   private String audioFileName = "./resources/audio.raw";
+  private String multiChannelAudioFileName = "./resources/commercial_stereo.wav";
+  private String gcsMultiChannelAudioPath = "gs://" + BUCKET + "/speech/commercial_stereo.wav";
   private String gcsAudioPath = "gs://" + BUCKET + "/speech/brooklyn.flac";
+  private String gcsDiarizationAudioPath = "gs://" + BUCKET + "/speech/commercial_mono.wav";
 
   // The path to the video file to transcribe
   private String videoFileName = "./resources/Google_Gnome.wav";
@@ -161,4 +161,62 @@ public void testMetadata() throws Exception {
     String got = bout.toString();
     assertThat(got).contains("Chrome");
   }
+
+  @Test
+  public void testTranscribeDiarization() throws Exception {
+    Recognize.transcribeDiarization(recognitionAudioFile);
+    String got = bout.toString();
+    assertThat(got).contains("Speaker Tag 2:");
+  }
+
+  @Test
+  public void testTranscribeDiarizationGcs() throws Exception {
+    Recognize.transcribeDiarizationGcs(gcsDiarizationAudioPath);
+    String got = bout.toString();
+    assertThat(got).contains("Speaker Tag 2:");
+  }
+
+  @Test
+  public void testTranscribeMultiChannel() throws Exception {
+    Recognize.transcribeMultiChannel(multiChannelAudioFileName);
+    String got = bout.toString();
+    assertThat(got).contains("Channel Tag : 1");
+  }
+
+  @Test
+  public void testTranscribeMultiChannelGcs() throws Exception {
+    Recognize.transcribeMultiChannelGcs(gcsMultiChannelAudioPath);
+    String got = bout.toString();
+    assertThat(got).contains("Channel Tag : 1");
+  }
+
+  @Test
+  public void testTranscribeMultiLanguage() throws Exception {
+    Recognize.transcribeMultiLanguage(videoFileName);
+    String got = bout.toString();
+    assertThat(got).contains("Transcript : OK Google");
+  }
+
+  @Test
+  public void testTranscribeMultiLanguageGcs() throws Exception {
+    Recognize.transcribeMultiLanguageGcs(gcsVideoPath);
+    String got = bout.toString();
+    assertThat(got).contains("Transcript : OK Google");
+  }
+
+  @Test
+  public void testTranscribeWordLevelConfidence() throws Exception {
+    Recognize.transcribeWordLevelConfidence(audioFileName);
+    String got = bout.toString();
+    assertThat(got).contains("Transcript : how old is the Brooklyn Bridge");
+    assertThat(got).contains("First Word and Confidence : how");
+  }
+
+  @Test
+  public void testTranscribeWordLevelConfidenceGcs() throws Exception {
+    Recognize.transcribeWordLevelConfidenceGcs(gcsAudioPath);
+    String got = bout.toString();
+    assertThat(got).contains("Transcript : how old is the Brooklyn Bridge");
+    assertThat(got).contains("First Word and Confidence : how");
+  }
 }
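
Note for reviewers: the diarization samples above print only the speaker tag of the last word in each result. For reference, here is a minimal sketch (not part of the patch, and only assuming the v1p1beta1 `SpeechRecognitionAlternative` and `WordInfo` types already imported in `Recognize.java`) of how the same per-word speaker tags could be grouped into per-speaker segments; the class and method names are hypothetical.

```java
import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
import com.google.cloud.speech.v1p1beta1.WordInfo;
import java.util.LinkedHashMap;
import java.util.Map;

class SpeakerSegments {
  /** Returns a map of speaker tag -> words attributed to that speaker, in utterance order. */
  static Map<Integer, StringBuilder> bySpeaker(SpeechRecognitionAlternative alternative) {
    Map<Integer, StringBuilder> segments = new LinkedHashMap<>();
    for (WordInfo wordInfo : alternative.getWordsList()) {
      // Each WordInfo carries the speaker tag assigned by diarization.
      segments
          .computeIfAbsent(wordInfo.getSpeakerTag(), tag -> new StringBuilder())
          .append(wordInfo.getWord())
          .append(' ');
    }
    return segments;
  }
}
```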
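The GCS samples poll with `while (!response.isDone()) { Thread.sleep(10000); }` before calling `response.get()`. Since `OperationFuture` implements `java.util.concurrent.Future`, the loop could instead be a single bounded blocking wait. A sketch under that assumption (the helper name and the 300-second bound are arbitrary choices, not part of the patch):

```java
import com.google.api.gax.longrunning.OperationFuture;
import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeMetadata;
import com.google.cloud.speech.v1p1beta1.LongRunningRecognizeResponse;
import java.util.concurrent.TimeUnit;

class TranscriptionWait {
  /** Blocks until the long-running recognize operation finishes, failing after 300 seconds. */
  static LongRunningRecognizeResponse awaitResponse(
      OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata> response)
      throws Exception {
    // get(timeout, unit) comes from java.util.concurrent.Future.
    return response.get(300, TimeUnit.SECONDS);
  }
}
```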