Merge pull request #249 from GoogleCloudPlatform/gcs-audio-input

added gcs read for audio file
GoogleCloudPlatform · Jun 2, 2016 · eb3310f · eb3310f
2 parents e2a9208 + babbf11
commit eb3310f
Show file tree

Hide file tree

Showing 5 changed files with 190 additions and 30 deletions.
diff --git a/speech/grpc/pom.xml b/speech/grpc/pom.xml
@@ -111,6 +111,12 @@ limitations under the License.
 
   <!-- // [START dependency] -->
   <dependencies>
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>4.12</version>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>commons-cli</groupId>
       <artifactId>commons-cli</artifactId>

diff --git a/speech/grpc/src/main/java/com/google/cloud/speech/grpc/demos/AudioRequestFactory.java b/speech/grpc/src/main/java/com/google/cloud/speech/grpc/demos/AudioRequestFactory.java
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2016 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package com.google.cloud.speech.grpc.demos;
+
+import com.google.cloud.speech.v1.AudioRequest;
+import com.google.protobuf.ByteString;
+
+import java.io.IOException;
+import java.net.URI;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+/**
+ * Builds {@link AudioRequest} messages from a URI. The URI can point to a local file, whose bytes
+ * are read and embedded in the request, or to a file on Google Cloud Storage, whose URI is placed
+ * in the request as-is.
+ */
+public class AudioRequestFactory {
+
+  private static final String FILE_SCHEME = "file";
+  private static final String GS_SCHEME   = "gs";
+
+  /**
+   * Takes an input URI and converts it to an audio request.
+   *
+   * <p>A URI without a scheme (e.g. {@code /foo/baz.raw}) or with the {@code file} scheme is read
+   * from the local file system. A URI with the {@code gs} scheme is not read here; it is stored in
+   * the request's {@code uri} field.
+   *
+   * @param uri input uri
+   * @return AudioRequest audio request
+   * @throws IOException if the local file cannot be read
+   * @throws IllegalArgumentException if the URI scheme is neither absent, {@code file} nor
+   *     {@code gs}
+   */
+  public static AudioRequest createRequest(URI uri)
+      throws IOException {
+    String scheme = uri.getScheme();
+    if (scheme == null) {
+      // Paths.get(URI) rejects a URI that has no scheme, so a bare path has to be resolved from
+      // its path component instead.
+      Path path = Paths.get(uri.getPath());
+      return audioFromBytes(Files.readAllBytes(path));
+    }
+    if (scheme.equals(FILE_SCHEME)) {
+      Path path = Paths.get(uri);
+      return audioFromBytes(Files.readAllBytes(path));
+    }
+    if (scheme.equals(GS_SCHEME)) {
+      return AudioRequest.newBuilder().setUri(uri.toString()).build();
+    }
+    throw new IllegalArgumentException("scheme not supported " + scheme);
+  }
+
+  /**
+   * Convert bytes to AudioRequest.
+   *
+   * @param bytes input bytes
+   * @return AudioRequest audio request whose content is a copy of {@code bytes}
+   */
+  private static AudioRequest audioFromBytes(byte[] bytes) {
+    return AudioRequest.newBuilder()
+        .setContent(ByteString.copyFrom(bytes))
+        .build();
+  }
+}
diff --git a/...ch/grpc/src/main/java/com/google/cloud/speech/grpc/demos/NonStreamingRecognizeClient.java b/...ch/grpc/src/main/java/com/google/cloud/speech/grpc/demos/NonStreamingRecognizeClient.java
@@ -32,7 +32,6 @@
 import com.google.cloud.speech.v1.NonStreamingRecognizeResponse;
 import com.google.cloud.speech.v1.RecognizeRequest;
 import com.google.cloud.speech.v1.SpeechGrpc;
-import com.google.protobuf.ByteString;
 import com.google.protobuf.TextFormat;
 
 import io.grpc.ManagedChannel;
@@ -49,9 +48,7 @@
 import org.apache.commons.cli.ParseException;
 
 import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
+import java.net.URI;
 import java.util.Arrays;
 import java.util.List;
 import java.util.concurrent.Executors;
@@ -72,7 +69,7 @@ public class NonStreamingRecognizeClient {
 
   private final String host;
   private final int port;
-  private final String file;
+  private final URI input;
   private final int samplingRate;
 
   private final ManagedChannel channel;
@@ -81,11 +78,11 @@ public class NonStreamingRecognizeClient {
   /**
    * Construct client connecting to Cloud Speech server at {@code host:port}.
    */
-  public NonStreamingRecognizeClient(String host, int port, String file, int samplingRate)
+  public NonStreamingRecognizeClient(String host, int port, URI input, int samplingRate)
       throws IOException {
     this.host = host;
     this.port = port;
-    this.file = file;
+    this.input = input;
     this.samplingRate = samplingRate;
 
     GoogleCredentials creds = GoogleCredentials.getApplicationDefault();
@@ -99,10 +96,7 @@ public NonStreamingRecognizeClient(String host, int port, String file, int sampl
   }
 
   private AudioRequest createAudioRequest() throws IOException {
-    Path path = Paths.get(file);
-    return AudioRequest.newBuilder()
-        .setContent(ByteString.copyFrom(Files.readAllBytes(path)))
-        .build();
+    return AudioRequestFactory.createRequest(this.input);
   }
 
   public void shutdown() throws InterruptedException {
@@ -115,10 +109,10 @@ public void recognize() {
     try {
       audio = createAudioRequest();
     } catch (IOException e) {
-      logger.log(Level.WARNING, "Failed to read audio file: " + file);
+      logger.log(Level.WARNING, "Failed to read audio uri input: " + input);
       return;
     }
-    logger.info("Sending " + audio.getContent().size() + " bytes from audio file: " + file);
+    logger.info("Sending " + audio.getContent().size() + " bytes from audio uri input: " + input);
     InitialRecognizeRequest initial = InitialRecognizeRequest.newBuilder()
         .setEncoding(AudioEncoding.LINEAR16)
         .setSampleRate(samplingRate)
@@ -147,8 +141,8 @@ public static void main(String[] args) throws Exception {
     CommandLineParser parser = new DefaultParser();
 
     Options options = new Options();
-    options.addOption(OptionBuilder.withLongOpt("file")
-        .withDescription("path to audio file")
+    options.addOption(OptionBuilder.withLongOpt("uri")
+        .withDescription("path to audio uri")
         .hasArg()
         .withArgName("FILE_PATH")
         .create());
@@ -170,10 +164,10 @@ public static void main(String[] args) throws Exception {
 
     try {
       CommandLine line = parser.parse(options, args);
-      if (line.hasOption("file")) {
-        audioFile = line.getOptionValue("file");
+      if (line.hasOption("uri")) {
+        audioFile = line.getOptionValue("uri");
       } else {
-        System.err.println("An Audio file path must be specified (e.g. /foo/baz.raw).");
+        System.err.println("An Audio uri must be specified (e.g. file:///foo/baz.raw).");
         System.exit(1);
       }
 
@@ -203,7 +197,7 @@ public static void main(String[] args) throws Exception {
     }
 
     NonStreamingRecognizeClient client =
-        new NonStreamingRecognizeClient(host, port, audioFile, sampling);
+        new NonStreamingRecognizeClient(host, port, URI.create(audioFile), sampling);
     try {
       client.recognize();
     } finally {

diff --git a/speech/grpc/src/main/proto/google/speech/v1/cloud-speech.proto b/speech/grpc/src/main/proto/google/speech/v1/cloud-speech.proto
@@ -23,14 +23,15 @@ option java_multiple_files = true;
 option java_outer_classname = "SpeechProto";
 option java_package = "com.google.cloud.speech.v1";
 
+
 // Service that implements Google Cloud Speech API.
 service Speech {
   // Perform bidirectional streaming speech recognition on audio using gRPC.
   rpc Recognize(stream RecognizeRequest) returns (stream RecognizeResponse);
 
   // Perform non-streaming speech recognition on audio using HTTPS.
   rpc NonStreamingRecognize(RecognizeRequest) returns (NonStreamingRecognizeResponse) {
-    option (.google.api.http) = { post: "/v1/speech:recognize" body: "*" };
+    option (google.api.http) = { post: "/v1/speech:recognize" body: "*" };
   }
 }
 
@@ -54,7 +55,7 @@ message RecognizeRequest {
 
   // The audio data to be recognized. For `NonStreamingRecognize`, all the
   // audio data must be contained in the first (and only) `RecognizeRequest`
-  //  message. For streaming `Recognize`, sequential chunks of audio data are
+  // message. For streaming `Recognize`, sequential chunks of audio data are
   // sent in sequential `RecognizeRequest` messages.
   AudioRequest audio_request = 2;
 }
@@ -64,7 +65,7 @@ message RecognizeRequest {
 message InitialRecognizeRequest {
   // Audio encoding of the data sent in the audio message.
   enum AudioEncoding {
-    // Not specified. Will return result `INVALID_ARGUMENT`.
+    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
     ENCODING_UNSPECIFIED = 0;
 
     // Uncompressed 16-bit signed little-endian samples.
@@ -118,8 +119,6 @@ message InitialRecognizeRequest {
   // profanities, replacing all but the initial character in each filtered word
   // with asterisks, e.g. "f***". If set to `false` or omitted, profanities
   // won't be filtered out.
-  // Note that profanity filtering is not implemented for all languages.
-  // If the language is not supported, this setting has no effect.
   bool profanity_filter = 5;
 
   // [Optional] If `false` or omitted, the recognizer will detect a single
@@ -146,13 +145,38 @@ message InitialRecognizeRequest {
   // as they become available.
   // If `false` or omitted, no `EndpointerEvents` are returned.
   bool enable_endpointer_events = 8;
+
+  // [Optional] URI that points to a file where the recognition result should
+  // be stored in JSON format. If omitted or empty string, the recognition
+  // result is returned in the response. Should be specified only for
+  // `NonStreamingRecognize`. If specified in a `Recognize` request,
+  // `Recognize` returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
+  // If specified in a `NonStreamingRecognize` request,
+  // `NonStreamingRecognize` returns immediately, and the output file
+  // is created asynchronously once the audio processing completes.
+  // Currently, only Google Cloud Storage URIs are supported, which must be
+  // specified in the following format: `gs://bucket_name/object_name`
+  // (other URI formats return [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
+  // more information, see [Request URIs](/storage/docs/reference-uris).
+  string output_uri = 9;
 }
 
 // Contains audio data in the format specified in the `InitialRecognizeRequest`.
+// Either `content` or `uri` must be supplied. Supplying both or neither
+// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
 message AudioRequest {
-  // [Required] The audio data bytes encoded as specified in
-  // `InitialRecognizeRequest`.
+  // The audio data bytes encoded as specified in
+  // `InitialRecognizeRequest`. Note: as with all bytes fields, protobuffers
+  // use a pure binary representation, whereas JSON representations use base64.
   bytes content = 1;
+
+  // URI that points to a file that contains audio data bytes as specified in
+  // `InitialRecognizeRequest`. Currently, only Google Cloud Storage URIs are
+  // supported, which must be specified in the following format:
+  // `gs://bucket_name/object_name` (other URI formats return
+  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
+  // [Request URIs](/storage/docs/reference-uris).
+  string uri = 2;
 }
 
 // `NonStreamingRecognizeResponse` is the only message returned to the client by
@@ -191,10 +215,14 @@ message RecognizeResponse {
 
   // [Output-only] If set, returns a [google.rpc.Status][] message that
   // specifies the error for the operation.
-  .google.rpc.Status error = 1;
-
-  // [Output-only] May contain zero or one `is_final=true` result (the newly
-  // settled portion). May also contain zero or more `is_final=false` results.
+  google.rpc.Status error = 1;
+
+  // [Output-only] For `continuous=false`, this repeated list contains zero or
+  // one result that corresponds to all of the audio processed so far. For
+  // `continuous=true`, this repeated list contains zero or more results that
+  // correspond to consecutive portions of the audio being processed.
+  // In both cases, contains zero or one `is_final=true` result (the newly
+  // settled portion), followed by zero or more `is_final=false` results.
   repeated SpeechRecognitionResult results = 2;
 
   // [Output-only] Indicates the lowest index in the `results` array that has
@@ -206,7 +234,10 @@ message RecognizeResponse {
   EndpointerEvent endpoint = 4;
 }
 
+// A speech recognition result corresponding to a portion of the audio.
 message SpeechRecognitionResult {
+  // [Output-only] May contain one or more recognition hypotheses (up to the
+  // maximum specified in `max_alternatives`).
   repeated SpeechRecognitionAlternative alternatives = 1;
 
   // [Output-only] Set `true` if this is the final time the speech service will

diff --git a/speech/grpc/src/test/java/com/google/cloud/speech/grpc/demos/AudioRequestFactoryTest.java b/speech/grpc/src/test/java/com/google/cloud/speech/grpc/demos/AudioRequestFactoryTest.java
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2016 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.cloud.speech.grpc.demos;
+
+import static org.junit.Assert.assertEquals;
+
+import com.google.cloud.speech.v1.AudioRequest;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+
+/**
+ * Exercises {@link AudioRequestFactory} with a local audio file and a Google Cloud Storage URI.
+ */
+@RunWith(JUnit4.class)
+public class AudioRequestFactoryTest {
+
+  /** A local file URI should produce a request whose content has the expected byte count. */
+  @Test
+  public void verifyBytesInSizeFromLocalFile() throws IOException {
+    File localAudio = new File("resources/audio.raw");
+    AudioRequest request = AudioRequestFactory.createRequest(localAudio.toURI());
+
+    // The request content is expected to be 57958 bytes long.
+    assertEquals(57958, request.getContent().size());
+  }
+
+  /** A gs:// URI should produce a request with empty content and the URI preserved. */
+  @Test
+  public void verifyBytesInSizeFromGoogleStorageFile() throws IOException {
+    String gcsLocation = "gs://cloud-samples-tests/speech/audio.raw";
+    AudioRequest request = AudioRequestFactory.createRequest(URI.create(gcsLocation));
+
+    // No audio bytes are embedded for a Cloud Storage input.
+    assertEquals(0, request.getContent().size());
+
+    // The original URI string is carried in the request.
+    assertEquals(gcsLocation, request.getUri());
+  }
+}