Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates to highlight word time offsets #787

Merged
merged 1 commit into from
Aug 3, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions speech/cloud-client/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,15 @@ Build your project with:
java -cp target/speech-google-cloud-samples-1.0.0-jar-with-dependencies.jar \
com.example.speech.Recognize asyncrecognize gs://cloud-samples-tests/speech/vr.flac
```

### Synchronously transcribe an audio file and print word offsets
```
java -cp target/speech-google-cloud-samples-1.0.0-jar-with-dependencies.jar \
com.example.speech.Recognize wordoffsets ./resources/audio.raw
```

### Asynchronously transcribe a remote audio file and print word offsets
```
java -cp target/speech-google-cloud-samples-1.0.0-jar-with-dependencies.jar \
com.example.speech.Recognize wordoffsets gs://cloud-samples-tests/speech/vr.flac
```
7 changes: 4 additions & 3 deletions speech/cloud-client/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@

<!-- Parent defines config for testing & linting. -->
<parent>
<groupId>com.google.cloud.samples</groupId>
<artifactId>shared-configuration</artifactId>
<version>1.0.5</version>
<artifactId>doc-samples</artifactId>
<groupId>com.google.cloud</groupId>
<version>1.0.0</version>
<relativePath>../..</relativePath>
</parent>

<properties>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ public static void main(String... args) throws Exception {
System.out.printf(
"\tjava %s \"<command>\" \"<path-to-image>\"\n"
+ "Commands:\n"
+ "\tsyncrecognize | asyncrecognize | streamrecognize\n"
+ "\tsyncrecognize | asyncrecognize | streamrecognize | wordoffsets\n"
+ "Path:\n\tA file path (ex: ./resources/audio.raw) or a URI "
+ "for a Cloud Storage resource (gs://...)\n",
Recognize.class.getCanonicalName());
Expand All @@ -66,6 +66,12 @@ public static void main(String... args) throws Exception {
} else {
syncRecognizeFile(path);
}
} else if (command.equals("wordoffsets")) {
if (path.startsWith("gs://")) {
asyncRecognizeWords(path);
} else {
syncRecognizeWords(path);
}
} else if (command.equals("asyncrecognize")) {
if (path.startsWith("gs://")) {
asyncRecognizeGcs(path);
Expand Down Expand Up @@ -113,6 +119,51 @@ public static void syncRecognizeFile(String fileName) throws Exception, IOExcept
speech.close();
}

/**
* Performs sync recognize and prints word time offsets.
*
* @param fileName the path to a PCM audio file to transcribe get offsets on.
*/
public static void syncRecognizeWords(String fileName) throws Exception, IOException {
SpeechClient speech = SpeechClient.create();

Path path = Paths.get(fileName);
byte[] data = Files.readAllBytes(path);
ByteString audioBytes = ByteString.copyFrom(data);

// Configure request with local raw PCM audio
RecognitionConfig config = RecognitionConfig.newBuilder()
.setEncoding(AudioEncoding.LINEAR16)
.setLanguageCode("en-US")
.setSampleRateHertz(16000)
.setEnableWordTimeOffsets(true)
.build();
RecognitionAudio audio = RecognitionAudio.newBuilder()
.setContent(audioBytes)
.build();

// Use blocking call to get audio transcript
RecognizeResponse response = speech.recognize(config, audio);
List<SpeechRecognitionResult> results = response.getResultsList();

for (SpeechRecognitionResult result: results) {
List<SpeechRecognitionAlternative> alternatives = result.getAlternativesList();
for (SpeechRecognitionAlternative alternative: alternatives) {
System.out.printf("Transcription: %s%n", alternative.getTranscript());
for (WordInfo wordInfo: alternative.getWordsList()) {
System.out.println(wordInfo.getWord());
System.out.printf("\t%s.%s sec - %s.%s sec\n",
wordInfo.getStartTime().getSeconds(),
wordInfo.getStartTime().getNanos() / 100000000,
wordInfo.getEndTime().getSeconds(),
wordInfo.getEndTime().getNanos() / 100000000);
}
}
}
speech.close();
}


/**
* Performs speech recognition on remote FLAC file and prints the transcription.
*
Expand Down Expand Up @@ -193,11 +244,11 @@ public static void asyncRecognizeFile(String fileName) throws Exception, IOExcep

/**
* Performs non-blocking speech recognition on remote FLAC file and prints
* the transcription.
* the transcription as well as word time offsets.
*
* @param gcsUri the path to the remote LINEAR16 audio file to transcribe.
*/
public static void asyncRecognizeGcs(String gcsUri) throws Exception, IOException {
public static void asyncRecognizeWords(String gcsUri) throws Exception, IOException {
// Instantiates a client with GOOGLE_APPLICATION_CREDENTIALS
SpeechClient speech = SpeechClient.create();

Expand Down Expand Up @@ -240,6 +291,47 @@ public static void asyncRecognizeGcs(String gcsUri) throws Exception, IOExceptio
speech.close();
}

/**
* Performs non-blocking speech recognition on remote FLAC file and prints
* the transcription.
*
* @param gcsUri the path to the remote LINEAR16 audio file to transcribe.
*/
public static void asyncRecognizeGcs(String gcsUri) throws Exception, IOException {
// Instantiates a client with GOOGLE_APPLICATION_CREDENTIALS
SpeechClient speech = SpeechClient.create();

// Configure remote file request for Linear16
RecognitionConfig config = RecognitionConfig.newBuilder()
.setEncoding(AudioEncoding.FLAC)
.setLanguageCode("en-US")
.setSampleRateHertz(16000)
.build();
RecognitionAudio audio = RecognitionAudio.newBuilder()
.setUri(gcsUri)
.build();

// Use non-blocking call for getting file transcription
OperationFuture<LongRunningRecognizeResponse, LongRunningRecognizeMetadata,
Operation> response =
speech.longRunningRecognizeAsync(config, audio);
while (!response.isDone()) {
System.out.println("Waiting for response...");
Thread.sleep(10000);
}

List<SpeechRecognitionResult> results = response.get().getResultsList();

for (SpeechRecognitionResult result: results) {
List<SpeechRecognitionAlternative> alternatives = result.getAlternativesList();
for (SpeechRecognitionAlternative alternative: alternatives) {
System.out.printf("Transcription: %s\n",alternative.getTranscript());
}
}
speech.close();
}


/**
* Performs streaming speech recognition on raw PCM audio data.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,14 @@ public void testRecognizeFile() throws Exception {
assertThat(got).contains("how old is the Brooklyn Bridge");
}

@Test
public void testRecognizeWordoffset() throws Exception {
Recognize.syncRecognizeWords(fileName);
String got = bout.toString();
assertThat(got).contains("how old is the Brooklyn Bridge");
assertThat(got).contains("\t0.0 sec -");
}

@Test
public void testRecognizeGcs() throws Exception {
Recognize.syncRecognizeGcs(gcsPath);
Expand All @@ -85,8 +93,9 @@ public void testAsyncRecognizeGcs() throws Exception {

@Test
public void testAsyncWordoffset() throws Exception {
Recognize.asyncRecognizeGcs(gcsPath);
Recognize.asyncRecognizeWords(gcsPath);
String got = bout.toString();
assertThat(got).contains("how old is the Brooklyn Bridge");
assertThat(got).contains("\t0.0 sec -");
}

Expand Down