diff --git a/.gitignore b/.gitignore index fb76e8d..cf74fba 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ pyvenv.cfg *__pycache__/ *.DS_Store *.venv/ -*.wav \ No newline at end of file +*.wav +speech_outputs/ \ No newline at end of file diff --git a/main.py b/main.py index 98eea8d..0094f98 100644 --- a/main.py +++ b/main.py @@ -8,6 +8,7 @@ from langchain.chat_models import ChatOpenAI from langchain.embeddings import OpenAIEmbeddings import openai +import uuid # Set up page configuration st.set_page_config(page_title="Speak-To-Docs", page_icon="📝", layout="wide", initial_sidebar_state="expanded") @@ -94,14 +95,21 @@ def get_llm() -> ChatOpenAI: logging.error(f"Error extracting content from document: {e}") else: st.session_state.uploaded_files = None - + + st.subheader("Speech output responses") + if 'speech_outputs' in st.session_state: + for speech_output in st.session_state.speech_outputs: + st.audio(os.path.join('speech_outputs', speech_output), format="audio/wav", start_time=0) def send_response(message, response=None): dummy_response = "Hello. How are you?" st.session_state.messages.append(('assistant', response or dummy_response)) + # TODO: make async ?? - print(response or dummy_response) - synthesize_speech(text=response or dummy_response) + # generate unique file name + output_file = uuid.uuid4().hex + ".wav" + synthesize_speech(output_file=output_file, text=response or dummy_response) + st.session_state.speech_outputs.append(output_file) # Chat area and audio input handling @@ -115,6 +123,9 @@ def send_message(): if 'messages' not in st.session_state: st.session_state.messages = [] + +if 'speech_outputs' not in st.session_state: + st.session_state.speech_outputs = [] message = st.container() diff --git a/src/speech_io.py b/src/speech_io.py index 6c2316e..720422f 100644 --- a/src/speech_io.py +++ b/src/speech_io.py @@ -76,20 +76,24 @@ def synthesize_speech(text, output_file="output.wav", voice_name='en-NG-EzinneNe """ if not SPEECH_KEY or not SPEECH_REGION: return False, "Azure Speech Service credentials are missing." + + path = "speech_outputs" + os.makedirs(path, exist_ok=True) + output_file = os.path.join(path, output_file) output = open(output_file, 'w+') output.close() try: # Configure speech service - speech_config = speechsdk.SpeechConfig(subscription=SPEECH_KEY, region=SPEECH_REGION) - # audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file) + speech_config = speechsdk.SpeechConfig(subscription=SPEECH_KEY, region=SPEECH_REGION) + audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file) # Set the voice for synthesis speech_config.speech_synthesis_voice_name = voice_name # Create synthesizer and generate speech - speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config) + speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config) result = speech_synthesizer.speak_text_async(text).get() # Handle the result