Skip to content

Commit

Permalink
Merge pull request #30 from KeneNwogu/main
Browse files Browse the repository at this point in the history
fix: speech synthesis integration into streamlit runner
  • Loading branch information
Sammybams authored Oct 15, 2024
2 parents 2fcb821 + 01704e6 commit 19c1e63
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 15 deletions.
44 changes: 32 additions & 12 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
import logging
from dotenv import load_dotenv
from src.speech_io import transcribe_audio
from src.speech_io import transcribe_audio, synthesize_speech
from src.rag_functions import allowed_files, file_check_num, extract_contents_from_doc
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
Expand Down Expand Up @@ -95,42 +95,62 @@ def get_llm() -> ChatOpenAI:
else:
st.session_state.uploaded_files = None


def send_response(message, response=None):
    """Append the assistant's reply to the chat history and speak it aloud.

    Parameters:
        message: the user message that triggered this reply (kept for
            interface stability with existing callers).
        response: the assistant's reply text; falls back to a canned
            greeting when None (placeholder until real RAG answers are
            wired in).
    """
    dummy_response = "Hello. How are you?"
    reply = response or dummy_response
    st.session_state.messages.append(('assistant', reply))
    # TODO: make async ?? -- speech synthesis blocks the Streamlit script run
    # Was a stray debugging print(); use the module's logging instead.
    logging.debug("Assistant reply: %s", reply)
    synthesize_speech(text=reply)


# Chat area and audio input handling
def send_message():
    """on_submit callback for the text chat input.

    Records the submitted text as a user message, then produces the
    assistant's reply (text plus synthesized speech).
    """
    user_text = st.session_state.prompt
    st.session_state.messages.append(('user', user_text))
    # Get a response, turn it into speech, and reply to the user.
    send_response(user_text)


# Initialise the chat transcript once per session.
if 'messages' not in st.session_state:
    st.session_state.messages = []

# Container that the message history is rendered into.
message = st.container()

# Text input is now handled by the keyed st.chat_input widget declared at
# the bottom of the script (on_submit=send_message); the old inline handler
# is kept here, commented out, for reference only.
# if prompt := st.chat_input("Enter your query"):
#     message.chat_message("user").write(prompt)

# Handle audio input from user
audio_value = st.experimental_audio_input("Record a voice message")
if audio_value:
def handle_audio_message():
    """on_change callback for the audio recorder widget.

    Persists the recorded clip to ``audio.wav``, transcribes it, and sends
    the transcript through the normal chat/response flow. Transcription
    failure is reported to the user; unexpected errors are surfaced via
    ``st.error`` and logged.
    """
    audio_value = st.session_state.audio_prompt
    try:
        # The speech service reads from a file, so buffer the clip to disk.
        with open("audio.wav", "wb") as f:
            f.write(audio_value.getbuffer())

        speech_text = transcribe_audio("audio.wav")
        if speech_text:
            message.chat_message("user").write(speech_text)
            st.session_state.messages.append(("user", speech_text))
            # Placeholder canned answer until the RAG pipeline supplies one.
            send_response(speech_text, "You have a great voice")
            logging.info("Audio transcribed successfully.")
        else:
            message.chat_message("user").write("Sorry, I couldn't transcribe your audio. Please try again.")
            send_response(speech_text, "Sorry, I couldn't transcribe your audio. Please try again.")
            logging.warning("Audio transcription failed.")
    except Exception as e:
        st.error("An error occurred while processing the audio. Please try again.")
        # NOTE(review): the source had this log line duplicated (diff
        # artifact); logged exactly once here.
        logging.error(f"Error processing audio input: {e}")


# Input area for user queries; submission is handled by send_message via
# the widget callback, so no return value is needed here.
st.chat_input("Enter your query", key='prompt', on_submit=send_message)

# Display the full chat transcript on every script rerun.
for role, text in st.session_state.messages:
    st.chat_message(role).write(text)


# Handle audio input from the user; processing happens in the
# handle_audio_message callback when a new recording arrives.
audio_value = st.experimental_audio_input("Record a voice message", key="audio_prompt", on_change=handle_audio_message)
5 changes: 4 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ altair==5.4.1
annotated-types==0.7.0
anyio==4.6.0
attrs==24.2.0
azure-ai-formrecognizer==3.3.3
azure-identity==1.19.0
azure-cognitiveservices-speech==1.40.0
blinker==1.8.2
cachetools==5.5.0
Expand Down Expand Up @@ -68,4 +70,5 @@ tqdm==4.66.5
typing_extensions==4.12.2
tzdata==2024.2
urllib3==2.2.3
yarl==1.12.1
werkzeug==3.0.4
7 changes: 5 additions & 2 deletions src/speech_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,16 +77,19 @@ def synthesize_speech(text, output_file="output.wav", voice_name='en-NG-EzinneNe
if not SPEECH_KEY or not SPEECH_REGION:
return False, "Azure Speech Service credentials are missing."

output = open(output_file, 'w+')
output.close()

try:
# Configure speech service
speech_config = speechsdk.SpeechConfig(subscription=SPEECH_KEY, region=SPEECH_REGION)
audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file)
# audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file)

# Set the voice for synthesis
speech_config.speech_synthesis_voice_name = voice_name

# Create synthesizer and generate speech
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)
result = speech_synthesizer.speak_text_async(text).get()

# Handle the result
Expand Down

0 comments on commit 19c1e63

Please sign in to comment.