diff --git a/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_custom_class_reference.py b/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_custom_class_reference.py
index 542b0d51108e..7cae68c071f2 100644
--- a/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_custom_class_reference.py
+++ b/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_custom_class_reference.py
@@ -44,7 +44,7 @@ def adaptation_v2_custom_class_reference(project_id, recognizer_id, phrase_set_i
     request = cloud_speech.CreateCustomClassRequest(
         parent=f"projects/{project_id}/locations/global",
         custom_class_id=custom_class_id,
-        custom_class=cloud_speech.CustomClass(items=[{"value": "Keem"}]))
+        custom_class=cloud_speech.CustomClass(items=[{"value": "fare"}]))
     operation = client.create_custom_class(request=request)
     custom_class = operation.result()

@@ -70,10 +70,6 @@ def adaptation_v2_custom_class_reference(project_id, recognizer_id, phrase_set_i
         auto_decoding_config={}, adaptation=adaptation
     )

-    print(custom_class)
-    print(phrase_set)
-    print(config)
-
     request = cloud_speech.RecognizeRequest(
         recognizer=recognizer.name, config=config, content=content
     )
diff --git a/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_custom_class_reference_test.py b/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_custom_class_reference_test.py
index a76aa6c9cf30..b869f4405519 100644
--- a/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_custom_class_reference_test.py
+++ b/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_custom_class_reference_test.py
@@ -48,11 +48,11 @@ def test_adaptation_v2_custom_class_reference(capsys):
     phrase_set_id = "phrase-set-" + str(uuid4())
     custom_class_id = "custom-class-" + str(uuid4())
     response = adaptation_v2_custom_class_reference.adaptation_v2_custom_class_reference(
-        project_id, recognizer_id, phrase_set_id, custom_class_id, os.path.join(RESOURCES, "baby_keem.wav")
+        project_id, recognizer_id, phrase_set_id, custom_class_id, os.path.join(RESOURCES, "fair.wav")
     )

     assert re.search(
-        r"play Baby Keem",
+        r"the word is fare",
         response.results[0].alternatives[0].transcript,
         re.DOTALL | re.I,
     )
diff --git a/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_inline_custom_class.py b/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_inline_custom_class.py
index 060a0a566dda..3c362fc35c39 100644
--- a/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_inline_custom_class.py
+++ b/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_inline_custom_class.py
@@ -41,8 +41,8 @@ def adaptation_v2_inline_custom_class(project_id, recognizer_id, audio_file):
         content = f.read()

     # Build inline phrase set to produce a more accurate transcript
-    phrase_set = cloud_speech.PhraseSet(phrases=[{"value": "${keem}", "boost": 20}])
-    custom_class = cloud_speech.CustomClass(name="keem", items=[{"value": "Keem"}])
+    phrase_set = cloud_speech.PhraseSet(phrases=[{"value": "${fare}", "boost": 20}])
+    custom_class = cloud_speech.CustomClass(name="fare", items=[{"value": "fare"}])
     adaptation = cloud_speech.SpeechAdaptation(
         phrase_sets=[
             cloud_speech.SpeechAdaptation.AdaptationPhraseSet(
diff --git a/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_inline_custom_class_test.py b/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_inline_custom_class_test.py
index 79cdf78699ef..efa2002f3b6a 100644
--- a/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_inline_custom_class_test.py
+++ b/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_inline_custom_class_test.py
@@ -34,11 +34,11 @@ def test_adaptation_v2_inline_custom_class(capsys):

     recognizer_id = "recognizer-" + str(uuid4())
     response = adaptation_v2_inline_custom_class.adaptation_v2_inline_custom_class(
-        project_id, recognizer_id, os.path.join(RESOURCES, "baby_keem.wav")
+        project_id, recognizer_id, os.path.join(RESOURCES, "fair.wav")
     )

     assert re.search(
-        r"play Baby Keem",
+        r"the word is fare",
         response.results[0].alternatives[0].transcript,
         re.DOTALL | re.I,
     )
diff --git a/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_inline_phrase_set.py b/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_inline_phrase_set.py
index de2939b9788f..e6bd581e1317 100644
--- a/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_inline_phrase_set.py
+++ b/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_inline_phrase_set.py
@@ -41,7 +41,7 @@ def adaptation_v2_inline_phrase_set(project_id, recognizer_id, audio_file):
         content = f.read()

     # Build inline phrase set to produce a more accurate transcript
-    phrase_set = cloud_speech.PhraseSet(phrases=[{"value": "Keem", "boost": 10}])
+    phrase_set = cloud_speech.PhraseSet(phrases=[{"value": "fare", "boost": 10}])
     adaptation = cloud_speech.SpeechAdaptation(
         phrase_sets=[
             cloud_speech.SpeechAdaptation.AdaptationPhraseSet(
diff --git a/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_inline_phrase_set_test.py b/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_inline_phrase_set_test.py
index 4254381c3360..ab68d5ae729d 100644
--- a/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_inline_phrase_set_test.py
+++ b/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_inline_phrase_set_test.py
@@ -34,11 +34,11 @@ def test_adaptation_v2_inline_phrase_set(capsys):

     recognizer_id = "recognizer-" + str(uuid4())
     response = adaptation_v2_inline_phrase_set.adaptation_v2_inline_phrase_set(
-        project_id, recognizer_id, os.path.join(RESOURCES, "baby_keem.wav")
+        project_id, recognizer_id, os.path.join(RESOURCES, "fair.wav")
     )

     assert re.search(
-        r"play Baby Keem",
+        r"the word is fare",
         response.results[0].alternatives[0].transcript,
         re.DOTALL | re.I,
     )
diff --git a/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_phrase_set_reference.py b/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_phrase_set_reference.py
index b89660d21ccd..ceb728557a1c 100644
--- a/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_phrase_set_reference.py
+++ b/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_phrase_set_reference.py
@@ -44,7 +44,7 @@ def adaptation_v2_phrase_set_reference(project_id, recognizer_id, phrase_set_id,
     request = cloud_speech.CreatePhraseSetRequest(
         parent=f"projects/{project_id}/locations/global",
         phrase_set_id=phrase_set_id,
-        phrase_set=cloud_speech.PhraseSet(phrases=[{"value": "Keem", "boost": 10}]))
+        phrase_set=cloud_speech.PhraseSet(phrases=[{"value": "fare", "boost": 10}]))
     operation = client.create_phrase_set(request=request)
     phrase_set = operation.result()
diff --git a/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_phrase_set_reference_test.py b/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_phrase_set_reference_test.py
index 933d552ad967..cbd1fe5dd5ac 100644
--- a/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_phrase_set_reference_test.py
+++ b/packages/google-cloud-python-speech/samples/snippets/adaptation_v2_phrase_set_reference_test.py
@@ -41,11 +41,11 @@ def test_adaptation_v2_phrase_set_reference(capsys):
     recognizer_id = "recognizer-" + str(uuid4())
     phrase_set_id = "phrase-set-" + str(uuid4())
     response = adaptation_v2_phrase_set_reference.adaptation_v2_phrase_set_reference(
-        project_id, recognizer_id, phrase_set_id, os.path.join(RESOURCES, "baby_keem.wav")
+        project_id, recognizer_id, phrase_set_id, os.path.join(RESOURCES, "fair.wav")
     )

     assert re.search(
-        r"play Baby Keem",
+        r"the word is fare",
         response.results[0].alternatives[0].transcript,
         re.DOTALL | re.I,
     )
diff --git a/packages/google-cloud-python-speech/samples/snippets/resources/audio_silence_padding.wav b/packages/google-cloud-python-speech/samples/snippets/resources/audio_silence_padding.wav
new file mode 100644
index 000000000000..db883c38634c
Binary files /dev/null and b/packages/google-cloud-python-speech/samples/snippets/resources/audio_silence_padding.wav differ
diff --git a/packages/google-cloud-python-speech/samples/snippets/resources/baby_keem.wav b/packages/google-cloud-python-speech/samples/snippets/resources/baby_keem.wav
deleted file mode 100644
index 4e7a5ca9bcd0..000000000000
Binary files a/packages/google-cloud-python-speech/samples/snippets/resources/baby_keem.wav and /dev/null differ
diff --git a/packages/google-cloud-python-speech/samples/snippets/resources/fair.wav b/packages/google-cloud-python-speech/samples/snippets/resources/fair.wav
new file mode 100644
index 000000000000..3eb1144f5cbe
Binary files /dev/null and b/packages/google-cloud-python-speech/samples/snippets/resources/fair.wav differ
diff --git a/packages/google-cloud-python-speech/samples/snippets/transcribe_streaming_voice_activity_events.py b/packages/google-cloud-python-speech/samples/snippets/transcribe_streaming_voice_activity_events.py
new file mode 100644
index 000000000000..50689433669a
--- /dev/null
+++ b/packages/google-cloud-python-speech/samples/snippets/transcribe_streaming_voice_activity_events.py
@@ -0,0 +1,108 @@
+# Copyright 2022 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import argparse
+
+# [START speech_transcribe_streaming_voice_activity_events]
+import io
+
+from google.cloud.speech_v2 import SpeechClient
+from google.cloud.speech_v2.types import cloud_speech
+
+
+def transcribe_streaming_voice_activity_events(project_id, recognizer_id, audio_file):
+    # Instantiates a client
+    client = SpeechClient()
+
+    request = cloud_speech.CreateRecognizerRequest(
+        parent=f"projects/{project_id}/locations/global",
+        recognizer_id=recognizer_id,
+        recognizer=cloud_speech.Recognizer(
+            language_codes=["en-US"], model="latest_long"
+        ),
+    )
+
+    # Creates a Recognizer
+    operation = client.create_recognizer(request=request)
+    recognizer = operation.result()
+
+    # Reads a file as bytes
+    with io.open(audio_file, "rb") as f:
+        content = f.read()
+
+    # In practice, stream should be a generator yielding chunks of audio data
+    chunk_length = len(content) // 5
+    stream = [
+        content[start : start + chunk_length]
+        for start in range(0, len(content), chunk_length)
+    ]
+    audio_requests = (
+        cloud_speech.StreamingRecognizeRequest(audio=audio) for audio in stream
+    )
+
+    recognition_config = cloud_speech.RecognitionConfig(auto_decoding_config={})
+
+    # Sets the flag to enable voice activity events
+    streaming_features = cloud_speech.StreamingRecognitionFeatures(
+        enable_voice_activity_events=True
+    )
+    streaming_config = cloud_speech.StreamingRecognitionConfig(
+        config=recognition_config, streaming_features=streaming_features
+    )
+
+    config_request = cloud_speech.StreamingRecognizeRequest(
+        recognizer=recognizer.name, streaming_config=streaming_config
+    )
+
+    def requests(config, audio):
+        yield config
+        for message in audio:
+            yield message
+
+    # Transcribes the audio into text
+    responses_iterator = client.streaming_recognize(
+        requests=requests(config_request, audio_requests)
+    )
+    responses = []
+    for response in responses_iterator:
+        responses.append(response)
+        if (
+            response.speech_event_type
+            == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
+        ):
+            print("Speech started.")
+        if (
+            response.speech_event_type
+            == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
+        ):
+            print("Speech ended.")
+        for result in response.results:
+            print("Transcript: {}".format(result.alternatives[0].transcript))
+
+    return responses
+# [END speech_transcribe_streaming_voice_activity_events]
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument("project_id", help="project to create recognizer in")
+    parser.add_argument("recognizer_id", help="name of recognizer to create")
+    parser.add_argument("audio_file", help="audio file to stream")
+    args = parser.parse_args()
+    transcribe_streaming_voice_activity_events(
+        args.project_id, args.recognizer_id, args.audio_file
+    )
diff --git a/packages/google-cloud-python-speech/samples/snippets/transcribe_streaming_voice_activity_events_test.py b/packages/google-cloud-python-speech/samples/snippets/transcribe_streaming_voice_activity_events_test.py
new file mode 100644
index 000000000000..b73ea128224b
--- /dev/null
+++ b/packages/google-cloud-python-speech/samples/snippets/transcribe_streaming_voice_activity_events_test.py
@@ -0,0 +1,58 @@
+# Copyright 2022, Google, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+from uuid import uuid4
+
+from google.cloud.speech_v2 import SpeechClient
+from google.cloud.speech_v2.types import cloud_speech
+
+import transcribe_streaming_voice_activity_events
+
+RESOURCES = os.path.join(os.path.dirname(__file__), "resources")
+
+
+def delete_recognizer(name):
+    client = SpeechClient()
+    request = cloud_speech.DeleteRecognizerRequest(name=name)
+    client.delete_recognizer(request=request)
+
+
+def test_transcribe_streaming_voice_activity_events(capsys):
+    project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
+
+    recognizer_id = "recognizer-" + str(uuid4())
+    responses = transcribe_streaming_voice_activity_events.transcribe_streaming_voice_activity_events(
+        project_id, recognizer_id, os.path.join(RESOURCES, "audio.wav")
+    )
+
+    transcript = ""
+    for response in responses:
+        for result in response.results:
+            transcript += result.alternatives[0].transcript
+
+    assert (
+        responses[0].speech_event_type
+        == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
+    )
+
+    assert re.search(
+        r"how old is the Brooklyn Bridge",
+        transcript,
+        re.DOTALL | re.I,
+    )
+
+    delete_recognizer(
+        f"projects/{project_id}/locations/global/recognizers/{recognizer_id}"
+    )
diff --git a/packages/google-cloud-python-speech/samples/snippets/transcribe_streaming_voice_activity_timeouts.py b/packages/google-cloud-python-speech/samples/snippets/transcribe_streaming_voice_activity_timeouts.py
new file mode 100644
index 000000000000..6b6bdfef03b0
--- /dev/null
+++ b/packages/google-cloud-python-speech/samples/snippets/transcribe_streaming_voice_activity_timeouts.py
@@ -0,0 +1,131 @@
+# Copyright 2022 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import argparse
+
+# [START speech_transcribe_streaming_voice_activity_timeouts]
+import io
+from time import sleep
+
+from google.cloud.speech_v2 import SpeechClient
+from google.cloud.speech_v2.types import cloud_speech
+from google.protobuf import duration_pb2  # type: ignore
+
+
+def transcribe_streaming_voice_activity_timeouts(
+    project_id, recognizer_id, speech_start_timeout, speech_end_timeout, audio_file
+):
+    # Instantiates a client
+    client = SpeechClient()
+
+    request = cloud_speech.CreateRecognizerRequest(
+        parent=f"projects/{project_id}/locations/global",
+        recognizer_id=recognizer_id,
+        recognizer=cloud_speech.Recognizer(
+            language_codes=["en-US"], model="latest_long"
+        ),
+    )
+
+    # Creates a Recognizer
+    operation = client.create_recognizer(request=request)
+    recognizer = operation.result()
+
+    # Reads a file as bytes
+    with io.open(audio_file, "rb") as f:
+        content = f.read()
+
+    # In practice, stream should be a generator yielding chunks of audio data
+    chunk_length = len(content) // 20
+    stream = [
+        content[start : start + chunk_length]
+        for start in range(0, len(content), chunk_length)
+    ]
+    audio_requests = (
+        cloud_speech.StreamingRecognizeRequest(audio=audio) for audio in stream
+    )
+
+    recognition_config = cloud_speech.RecognitionConfig(auto_decoding_config={})
+
+    # Sets the flag to enable voice activity events and timeout
+    speech_start_timeout = duration_pb2.Duration(seconds=speech_start_timeout)
+    speech_end_timeout = duration_pb2.Duration(seconds=speech_end_timeout)
+    voice_activity_timeout = (
+        cloud_speech.StreamingRecognitionFeatures.VoiceActivityTimeout(
+            speech_start_timeout=speech_start_timeout,
+            speech_end_timeout=speech_end_timeout,
+        )
+    )
+    streaming_features = cloud_speech.StreamingRecognitionFeatures(
+        enable_voice_activity_events=True, voice_activity_timeout=voice_activity_timeout
+    )
+
+    streaming_config = cloud_speech.StreamingRecognitionConfig(
+        config=recognition_config, streaming_features=streaming_features
+    )
+
+    config_request = cloud_speech.StreamingRecognizeRequest(
+        recognizer=recognizer.name, streaming_config=streaming_config
+    )
+
+    def requests(config, audio):
+        yield config
+        for message in audio:
+            sleep(0.5)
+            yield message
+
+    # Transcribes the audio into text
+    responses_iterator = client.streaming_recognize(
+        requests=requests(config_request, audio_requests)
+    )
+
+    responses = []
+    for response in responses_iterator:
+        responses.append(response)
+        if (
+            response.speech_event_type
+            == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
+        ):
+            print("Speech started.")
+        if (
+            response.speech_event_type
+            == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
+        ):
+            print("Speech ended.")
+        for result in response.results:
+            print("Transcript: {}".format(result.alternatives[0].transcript))
+
+    return responses
+# [END speech_transcribe_streaming_voice_activity_timeouts]
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument("project_id", help="project to create recognizer in")
+    parser.add_argument("recognizer_id", help="name of recognizer to create")
+    parser.add_argument(
+        "speech_start_timeout", type=int, help="timeout in seconds for speech start"
+    )
+    parser.add_argument("speech_end_timeout", type=int, help="timeout in seconds for speech end")
+    parser.add_argument("audio_file", help="audio file to stream")
+    args = parser.parse_args()
+    transcribe_streaming_voice_activity_timeouts(
+        args.project_id,
+        args.recognizer_id,
+        args.speech_start_timeout,
+        args.speech_end_timeout,
+        args.audio_file,
+    )
diff --git a/packages/google-cloud-python-speech/samples/snippets/transcribe_streaming_voice_activity_timeouts_test.py b/packages/google-cloud-python-speech/samples/snippets/transcribe_streaming_voice_activity_timeouts_test.py
new file mode 100644
index 000000000000..3069c4a9bd57
--- /dev/null
+++ b/packages/google-cloud-python-speech/samples/snippets/transcribe_streaming_voice_activity_timeouts_test.py
@@ -0,0 +1,80 @@
+# Copyright 2022, Google, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+from uuid import uuid4
+
+from google.cloud.speech_v2 import SpeechClient
+from google.cloud.speech_v2.types import cloud_speech
+
+import transcribe_streaming_voice_activity_timeouts
+
+RESOURCES = os.path.join(os.path.dirname(__file__), "resources")
+
+
+def delete_recognizer(name):
+    client = SpeechClient()
+    request = cloud_speech.DeleteRecognizerRequest(name=name)
+    client.delete_recognizer(request=request)
+
+
+def test_transcribe_streaming_voice_activity_timeouts(capsys):
+    project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
+
+    recognizer_id = "recognizer-" + str(uuid4())
+    responses = transcribe_streaming_voice_activity_timeouts.transcribe_streaming_voice_activity_timeouts(
+        project_id,
+        recognizer_id,
+        1,
+        5,
+        os.path.join(RESOURCES, "audio_silence_padding.wav"),
+    )
+
+    assert len(responses) == 0
+
+    recognizer_id_2 = "recognizer-2-" + str(uuid4())
+    responses = transcribe_streaming_voice_activity_timeouts.transcribe_streaming_voice_activity_timeouts(
+        project_id,
+        recognizer_id_2,
+        5,
+        1,
+        os.path.join(RESOURCES, "audio_silence_padding.wav"),
+    )
+    transcript = ""
+    for response in responses:
+        for result in response.results:
+            transcript += result.alternatives[0].transcript
+
+    assert (
+        responses[0].speech_event_type
+        == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
+    )
+
+    assert (
+        responses[1].speech_event_type
+        == cloud_speech.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END
+    )
+
+    assert re.search(
+        r"how old is the Brooklyn Bridge",
+        transcript,
+        re.DOTALL | re.I,
+    )
+
+    delete_recognizer(
+        f"projects/{project_id}/locations/global/recognizers/{recognizer_id}"
+    )
+    delete_recognizer(
+        f"projects/{project_id}/locations/global/recognizers/{recognizer_id_2}"
+    )