From bd826c9238ea7cba63ac7ea0db043fefd8726f75 Mon Sep 17 00:00:00 2001
From: Odeyiany2
Date: Fri, 18 Oct 2024 21:41:01 +0100
Subject: [PATCH 1/6] Added functions to Embed document Chunks

---
 CONTRIBUTING.md      |  1 +
 main.py              | 98 ++++++++++++++++++++++++++++++++++++--------
 src/rag_functions.py | 46 +++++++++++++++++++++
 3 files changed, 128 insertions(+), 17 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 425e736..ea77a55 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -112,3 +112,4 @@ The project is structured as follows:
 7. [Azure OpenAI Models: Deployment](https://learn.microsoft.com/azure/ai-services/openai/how-to/working-with-models?tabs=powershell?wt.mc_id=studentamb_405806)
 8. [Azure Speech Service documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/?wt.mc_id=studentamb_217190)
 9. [Develop Generative AI solutions with Azure OpenAI Service](https://learn.microsoft.com/en-us/training/paths/develop-ai-solutions-azure-openai/?wt.mc_id=studentamb_217190)
+10. [Langchain's DocArrayInMemoryStore Documentation](https://python.langchain.com/docs/integrations/vectorstores/docarray_in_memory/)
diff --git a/main.py b/main.py
index 98eea8d..71af378 100644
--- a/main.py
+++ b/main.py
@@ -3,10 +3,14 @@
 import logging
 from dotenv import load_dotenv
 from src.speech_io import transcribe_audio, synthesize_speech
-from src.rag_functions import allowed_files, file_check_num, extract_contents_from_doc
-from langchain.chains import RetrievalQA
+from src.rag_functions import (allowed_files, file_check_num,
+                               extract_contents_from_doc, create_vector_store, logger)
+from langchain.chains.retrieval import create_retrieval_chain
+from langchain.chains.history_aware_retriever import create_history_aware_retriever
 from langchain.chat_models import ChatOpenAI
 from langchain.embeddings import OpenAIEmbeddings
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain_core.prompts import PromptTemplate
 import openai
 
 # Set up page configuration
@@ -28,13 +32,13 @@ def get_llm() -> ChatOpenAI:
     openai.api_type = "azure"
     openai.api_version = os.getenv("OPENAI_API_VERSION")
 
-    # OpenAI Settings
-    openai_embeddings = OpenAIEmbeddings(
-        openai_api_version=os.getenv("OPENAI_API_VERSION"),
-        openai_api_key=os.getenv("API_KEY"),
-        openai_api_base=os.getenv("ENDPOINT"),
-        openai_api_type="azure"
-    )
+    # # OpenAI Settings
+    # openai_embeddings = OpenAIEmbeddings(
+    #     openai_api_version=os.getenv("OPENAI_API_VERSION"),
+    #     openai_api_key=os.getenv("API_KEY"),
+    #     openai_api_base=os.getenv("ENDPOINT"),
+    #     openai_api_type="azure"
+    # )
     llm = ChatOpenAI(
         temperature=0.3,
         openai_api_key=os.getenv("API_KEY"),
@@ -50,6 +54,54 @@ def get_llm() -> ChatOpenAI:
 
 llm = get_llm()
 
+
+def query_response(query, vector_store):
+    """
+    Generates a response to the user's query using the vector store and the language model.
+    Uses Langchains retrieval library
+
+    Args:
+        query (str): The user's input query.
+        vector_store (DocArrayInMemorySearch): The initialized vector store
+
+    Returns:
+        str: The generated response.
+    """
+    try:
+        llm = get_llm()
+        #prompting for the llm
+        prompt_template = """Use the following excerpts to answer a query. If you can't find the answer from the provided document,
+        don't try to make up an answer. Just say "I can't find the answer from the provided document but you may want to check the following links".
+
+        Context: {context}
+
+        Question: {question}
+
+        Helpful Answer:
+        """
+        qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
+        # Create history-aware retriever
+        history_aware_retriever = create_history_aware_retriever(
+            llm,
+            vector_store.as_retriever(search_type="similarity",
+                                      search_kwargs={"k": 3},),
+            qa_prompt,)
+        #initializing a question answer chain
+        question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
+
+        #query retrieval chain
+        chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
+
+        #retrieve answer
+        response = chain.invoke({"question": query})
+        logger.info("Response successfully generated")
+        return response["answer"]
+
+    except Exception as e:
+        logger.error(f"Error occurred in generating response:{e}")
+        return "Sorry, I couldn't process your request at the moment."
+
+
 # Sidebar configuration for file uploads
 if 'uploaded_files' not in st.session_state:
     st.session_state.uploaded_files = None
@@ -87,8 +139,11 @@ def get_llm() -> ChatOpenAI:
         if valid_file and valid_files:
             try:
                 extraction_results = extract_contents_from_doc(valid_files, "temp_dir")
-                st.success(f"{len(st.session_state.uploaded_files)} file(s) uploaded and processed successfully.")
-                logging.info("File(s) uploaded and processed successfully.")
+                vector_store = create_vector_store(extraction_results)
+                if vector_store:
+                    st.session_state['vector_store'] = vector_store
+                    st.success(f"{len(st.session_state.uploaded_files)} file(s) uploaded and processed successfully.")
+                    logging.info("File(s) uploaded and processed successfully.")
             except Exception as e:
                 st.error("An error occurred while processing your document. Please try again.")
                 logging.error(f"Error extracting content from document: {e}")
@@ -97,11 +152,19 @@ def get_llm() -> ChatOpenAI:
 
 
 def send_response(message, response=None):
-    dummy_response = "Hello. How are you?"
-    st.session_state.messages.append(('assistant', response or dummy_response))
+    # dummy_response = "Hello. How are you?"
+    # st.session_state.messages.append(('assistant', response or dummy_response))
     # TODO: make async ??
-    print(response or dummy_response)
-    synthesize_speech(text=response or dummy_response)
+    vector_store = st.session_state['vector_store']
+
+    # Get the response from query_response
+    answer = query_response(message, vector_store)
+
+    # Append the assistant's response to messages
+    st.session_state.messages.append(('assistant', answer))
+
+    print(answer)
+    synthesize_speech(text=answer)
 
 
 # Chat area and audio input handling
@@ -131,8 +194,9 @@ def handle_audio_message():
     speech_text = transcribe_audio("audio.wav")
     if speech_text:
         st.session_state.messages.append(("user", speech_text))
-        send_response(speech_text, "You have a great voice")
-        logging.info("Audio transcribed successfully.")
+        #send_response(speech_text, "You have a great voice")
+        send_response(speech_text)
+        logging.info("Audio transcribed and response generated successfully.")
     else:
         # st.session_state.messages.append(("assistant", ))
         send_response(speech_text, "Sorry, I couldn't transcribe your audio. Please try again.")
diff --git a/src/rag_functions.py b/src/rag_functions.py
index 89207c0..f4ec846 100644
--- a/src/rag_functions.py
+++ b/src/rag_functions.py
@@ -8,6 +8,10 @@
 from azure.ai.formrecognizer import DocumentAnalysisClient
 from azure.core.credentials import AzureKeyCredential
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.embeddings import OpenAIEmbeddings
+from langchain_community.vectorstores import DocArrayInMemorySearch
+from langchain.schema import Document
+
 
 # Setting up logging
 logging.basicConfig(level=logging.INFO)
@@ -129,3 +133,45 @@ def extract_contents_from_doc(files, temp_dir):
             continue  # Proceed with the next file in case of an error
 
     return extracted_file_paths
+
+#function to embed the chunks created on docs and initializing a vector store
+def create_vector_store(extracted_file_paths):
+    """
+    Embeds the documents and initializes a DocArrayInMemorySearch vector store.
+
+    Args:
+        extracted_file_path: A path containing the contents extracted from the documents uploaded
+
+    Returns:
+        DocArrayInMemorySearch: An initialized vector store with embedded documents.
+    """
+    try:
+        #OpenAI Embedding settings
+        openai_embeddings = OpenAIEmbeddings(
+            openai_api_version=os.getenv("OPENAI_API_VERSION"),
+            openai_api_key=os.getenv("API_KEY"),
+            openai_api_base=os.getenv("ENDPOINT"),
+            openai_api_type="azure",
+            deployment="text-embedding-ada-002"
+        )
+        logger.info("OpenAI Embeddings initialized successfully.")
+        docs = []
+        for file_path in extracted_file_paths:
+            try:
+                with open(file_path, "r", encoding="utf-8") as file:
+                    text = file.read()
+                    chunks = chunk_document(text)
+                    docs.extend([Document(page_content=chunk) for chunk in chunks])
+                    logger.info(f"Document {file_path} chunked into {len(chunks)} chunks.")
+            except Exception as e:
+                logger.error(f"Error reading or chunking file '{file_path}': {e}")
+                continue
+
+        #initializing the vector store
+        vector_store = DocArrayInMemorySearch.from_documents(docs, openai_embeddings)
+        logger.info("DocArrayInMemorySearch vector store initialized successfully.")
+
+        return vector_store
+
+    except Exception as e:
+        logger.exception(f"An error occurred while initializing the vector store: {e}")
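
A note on the chain wiring added in this patch: in the LangChain releases this PR builds against, create_retrieval_chain feeds the retriever from the "input" key of the invoke payload and returns the model output under "answer", and create_history_aware_retriever expects a prompt containing an "input" (and usually "chat_history") variable. The patch invokes the chain with {"question": query} against a {context}/{question} prompt, which is likely to fail at runtime. Below is a minimal sketch of the same wiring with the expected keys; the prompt wording and the build_chain helper are illustrative, not part of the PR.

    # Sketch (not part of the PR): retrieval wiring with the keys that
    # create_retrieval_chain expects ("input" in, "answer" out).
    from langchain.chains.retrieval import create_retrieval_chain
    from langchain.chains.combine_documents import create_stuff_documents_chain
    from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

    qa_prompt = ChatPromptTemplate.from_messages([
        ("system", "Answer only from the excerpts below. If the answer is not "
                   "in the excerpts, say you cannot find it.\n\nContext: {context}"),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ])

    def build_chain(llm, vector_store):
        # Retrieve the 3 most similar chunks, then stuff them into {context}.
        retriever = vector_store.as_retriever(search_type="similarity",
                                              search_kwargs={"k": 3})
        combine_docs_chain = create_stuff_documents_chain(llm, qa_prompt)
        return create_retrieval_chain(retriever, combine_docs_chain)

    # Usage:
    #   chain = build_chain(get_llm(), st.session_state["vector_store"])
    #   result = chain.invoke({"input": "What does the report conclude?",
    #                          "chat_history": []})
    #   print(result["answer"])
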
From 4b746454f4462f459b930bcac06ceb24b106e971 Mon Sep 17 00:00:00 2001
From: Odeyiany2
Date: Sat, 19 Oct 2024 08:42:12 +0100
Subject: [PATCH 2/6] Added a function for DocArrayInMemorySearch

---
 main.py              | 99 ++++++++++++++++++++------------------------
 src/rag_functions.py | 45 --------------------
 2 files changed, 44 insertions(+), 100 deletions(-)

diff --git a/main.py b/main.py
index 71af378..839a37b 100644
--- a/main.py
+++ b/main.py
@@ -4,7 +4,10 @@
 from dotenv import load_dotenv
 from src.speech_io import transcribe_audio, synthesize_speech
 from src.rag_functions import (allowed_files, file_check_num,
-                               extract_contents_from_doc, create_vector_store, logger)
+                               extract_contents_from_doc, chunk_document, logger)
+from langchain.embeddings import OpenAIEmbeddings
+from langchain_community.vectorstores import DocArrayInMemorySearch
+from langchain.schema import Document
 from langchain.chains.retrieval import create_retrieval_chain
 from langchain.chains.history_aware_retriever import create_history_aware_retriever
 from langchain.chat_models import ChatOpenAI
@@ -54,52 +57,47 @@ def get_llm() -> ChatOpenAI:
 
 llm = get_llm()
 
-
-def query_response(query, vector_store):
+#function to embed the chunks created on docs and initializing a vector store
+def create_vector_store(extracted_file_paths):
     """
-    Generates a response to the user's query using the vector store and the language model.
-    Uses Langchains retrieval library
+    Embeds the documents and initializes a DocArrayInMemorySearch vector store.
 
     Args:
-        query (str): The user's input query.
-        vector_store (DocArrayInMemorySearch): The initialized vector store
+        extracted_file_path: A path containing the contents extracted from the documents uploaded
 
     Returns:
-        str: The generated response.
+        DocArrayInMemorySearch: An initialized vector store with embedded documents.
     """
     try:
-        llm = get_llm()
-        #prompting for the llm
-        prompt_template = """Use the following excerpts to answer a query. If you can't find the answer from the provided document,
-        don't try to make up an answer. Just say "I can't find the answer from the provided document but you may want to check the following links".
-
-        Context: {context}
-
-        Question: {question}
-
-        Helpful Answer:
-        """
-        qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
-        # Create history-aware retriever
-        history_aware_retriever = create_history_aware_retriever(
-            llm,
-            vector_store.as_retriever(search_type="similarity",
-                                      search_kwargs={"k": 3},),
-            qa_prompt,)
-        #initializing a question answer chain
-        question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
-
-        #query retrieval chain
-        chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
-
-        #retrieve answer
-        response = chain.invoke({"question": query})
-        logger.info("Response successfully generated")
-        return response["answer"]
-
+        #OpenAI Embedding settings
+        openai_embeddings = OpenAIEmbeddings(
+            openai_api_version=os.getenv("OPENAI_API_VERSION"),
+            openai_api_key=os.getenv("API_KEY"),
+            openai_api_base=os.getenv("ENDPOINT"),
+            openai_api_type="azure",
+            deployment="text-embedding-ada-002"
+        )
+        logger.info("OpenAI Embeddings initialized successfully.")
+        docs = []
+        for file_path in extracted_file_paths:
+            try:
+                with open(file_path, "r", encoding="utf-8") as file:
+                    text = file.read()
+                    chunks = chunk_document(text)
+                    docs.extend([Document(page_content=chunk) for chunk in chunks])
+                    logger.info(f"Document {file_path} chunked into {len(chunks)} chunks.")
+            except Exception as e:
+                logger.error(f"Error reading or chunking file '{file_path}': {e}")
+                continue
+
+        #initializing the vector store
+        vector_store = DocArrayInMemorySearch.from_documents(docs, openai_embeddings)
+        logger.info("DocArrayInMemorySearch vector store initialized successfully.")
+
+        return vector_store
+
     except Exception as e:
-        logger.error(f"Error occurred in generating response:{e}")
-        return "Sorry, I couldn't process your request at the moment."
+        logger.exception(f"An error occurred while initializing the vector store: {e}")
 
 
 # Sidebar configuration for file uploads
@@ -152,19 +150,11 @@ def query_response(query, vector_store):
 
 
 def send_response(message, response=None):
-    # dummy_response = "Hello. How are you?"
-    # st.session_state.messages.append(('assistant', response or dummy_response))
+    dummy_response = "Hello. How are you?"
+    st.session_state.messages.append(('assistant', response or dummy_response))
     # TODO: make async ??
-    vector_store = st.session_state['vector_store']
-
-    # Get the response from query_response
-    answer = query_response(message, vector_store)
-
-    # Append the assistant's response to messages
-    st.session_state.messages.append(('assistant', answer))
-
-    print(answer)
-    synthesize_speech(text=answer)
+    print(response or dummy_response)
+    synthesize_speech(text=response or dummy_response)
 
 
 # Chat area and audio input handling
@@ -194,9 +184,8 @@ def handle_audio_message():
     speech_text = transcribe_audio("audio.wav")
     if speech_text:
         st.session_state.messages.append(("user", speech_text))
-        #send_response(speech_text, "You have a great voice")
-        send_response(speech_text)
-        logging.info("Audio transcribed and response generated successfully.")
+        send_response(speech_text, "You have a great voice")
+        logging.info("Audio transcribed successfully.")
     else:
         # st.session_state.messages.append(("assistant", ))
         send_response(speech_text, "Sorry, I couldn't transcribe your audio. Please try again.")
 
 
 
 # Handle audio input from user
-audio_value = st.experimental_audio_input("Record a voice message", key="audio_prompt", on_change=handle_audio_message)
+audio_value = st.experimental_audio_input("Record a voice message", key="audio_prompt", on_change=handle_audio_message)
\ No newline at end of file
diff --git a/src/rag_functions.py b/src/rag_functions.py
index f4ec846..9c02683 100644
--- a/src/rag_functions.py
+++ b/src/rag_functions.py
@@ -8,9 +8,6 @@
 from azure.ai.formrecognizer import DocumentAnalysisClient
 from azure.core.credentials import AzureKeyCredential
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.embeddings import OpenAIEmbeddings
-from langchain_community.vectorstores import DocArrayInMemorySearch
-from langchain.schema import Document
 
 
 # Setting up logging
@@ -133,45 +130,3 @@ def extract_contents_from_doc(files, temp_dir):
             continue  # Proceed with the next file in case of an error
 
     return extracted_file_paths
-
-#function to embed the chunks created on docs and initializing a vector store
-def create_vector_store(extracted_file_paths):
-    """
-    Embeds the documents and initializes a DocArrayInMemorySearch vector store.
-
-    Args:
-        extracted_file_path: A path containing the contents extracted from the documents uploaded
-
-    Returns:
-        DocArrayInMemorySearch: An initialized vector store with embedded documents.
-    """
-    try:
-        #OpenAI Embedding settings
-        openai_embeddings = OpenAIEmbeddings(
-            openai_api_version=os.getenv("OPENAI_API_VERSION"),
-            openai_api_key=os.getenv("API_KEY"),
-            openai_api_base=os.getenv("ENDPOINT"),
-            openai_api_type="azure",
-            deployment="text-embedding-ada-002"
-        )
-        logger.info("OpenAI Embeddings initialized successfully.")
-        docs = []
-        for file_path in extracted_file_paths:
-            try:
-                with open(file_path, "r", encoding="utf-8") as file:
-                    text = file.read()
-                    chunks = chunk_document(text)
-                    docs.extend([Document(page_content=chunk) for chunk in chunks])
-                    logger.info(f"Document {file_path} chunked into {len(chunks)} chunks.")
-            except Exception as e:
-                logger.error(f"Error reading or chunking file '{file_path}': {e}")
-                continue
-
-        #initializing the vector store
-        vector_store = DocArrayInMemorySearch.from_documents(docs, openai_embeddings)
-        logger.info("DocArrayInMemorySearch vector store initialized successfully.")
-
-        return vector_store
-
-    except Exception as e:
-        logger.exception(f"An error occurred while initializing the vector store: {e}")
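
With this patch the embed-and-store logic lives in main.py. The flow can be exercised in isolation; a minimal sketch follows, assuming the docarray package is installed (DocArrayInMemorySearch depends on it) and the repo's API_KEY, ENDPOINT, and OPENAI_API_VERSION environment variables are set. The sample texts and query are illustrative, not from the PR.

    # Sketch (not part of the PR): embed a few chunks and query the store.
    import os
    from langchain.embeddings import OpenAIEmbeddings
    from langchain_community.vectorstores import DocArrayInMemorySearch
    from langchain.schema import Document

    embeddings = OpenAIEmbeddings(
        openai_api_version=os.getenv("OPENAI_API_VERSION"),
        openai_api_key=os.getenv("API_KEY"),
        openai_api_base=os.getenv("ENDPOINT"),
        openai_api_type="azure",
        deployment="text-embedding-ada-002",  # Azure deployment name used by the PR
    )

    docs = [Document(page_content=text) for text in
            ("The report covers Q3 revenue.", "Appendix B lists the data sources.")]
    store = DocArrayInMemorySearch.from_documents(docs, embeddings)

    # Return the k most similar chunks for a query.
    for hit in store.similarity_search("Where is revenue discussed?", k=1):
        print(hit.page_content)
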
From 3d42cf1906cbf3d06e6b26b055eb6c4be861df67 Mon Sep 17 00:00:00 2001
From: Odeyiany2
Date: Sat, 19 Oct 2024 08:44:42 +0100
Subject: [PATCH 3/6] Added a function for DocArrayInMemorySearch

---
 main.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/main.py b/main.py
index 839a37b..21eb8c6 100644
--- a/main.py
+++ b/main.py
@@ -8,12 +8,8 @@
 from langchain.embeddings import OpenAIEmbeddings
 from langchain_community.vectorstores import DocArrayInMemorySearch
 from langchain.schema import Document
-from langchain.chains.retrieval import create_retrieval_chain
-from langchain.chains.history_aware_retriever import create_history_aware_retriever
 from langchain.chat_models import ChatOpenAI
 from langchain.embeddings import OpenAIEmbeddings
-from langchain.chains.combine_documents import create_stuff_documents_chain
-from langchain_core.prompts import PromptTemplate
 import openai
 
 # Set up page configuration

From d64af354f1bfd459a751fc456b382524885ddd11 Mon Sep 17 00:00:00 2001
From: Samuel Bamgbola
Date: Sat, 19 Oct 2024 20:10:00 +0100
Subject: [PATCH 4/6] Removed redundant newline

---
 src/rag_functions.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/rag_functions.py b/src/rag_functions.py
index 9c02683..89207c0 100644
--- a/src/rag_functions.py
+++ b/src/rag_functions.py
@@ -9,7 +9,6 @@
 from azure.core.credentials import AzureKeyCredential
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
-
 # Setting up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
From 15fbade4439d5a276f5cb72e6b8ba9b82599bf7e Mon Sep 17 00:00:00 2001
From: Odeyiany2
Date: Mon, 21 Oct 2024 14:16:24 +0100
Subject: [PATCH 5/6] Updated Azure OpenAI Embedding Model

---
 main.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/main.py b/main.py
index 21eb8c6..25a66f7 100644
--- a/main.py
+++ b/main.py
@@ -5,7 +5,7 @@
 from src.speech_io import transcribe_audio, synthesize_speech
 from src.rag_functions import (allowed_files, file_check_num,
                                extract_contents_from_doc, chunk_document, logger)
-from langchain.embeddings import OpenAIEmbeddings
+from langchain.embeddings import OpenAIEmbeddings, AzureOpenAIEmbeddings
 from langchain_community.vectorstores import DocArrayInMemorySearch
 from langchain.schema import Document
 from langchain.chat_models import ChatOpenAI
@@ -66,8 +66,10 @@ def create_vector_store(extracted_file_paths):
     """
     try:
         #OpenAI Embedding settings
-        openai_embeddings = OpenAIEmbeddings(
+        openai_embeddings = AzureOpenAIEmbeddings(
             openai_api_version=os.getenv("OPENAI_API_VERSION"),
+            chunk_size= 1024,
+            validate_base_url=True, # Explicitly provide validate_base_url
             openai_api_key=os.getenv("API_KEY"),
             openai_api_base=os.getenv("ENDPOINT"),
             openai_api_type="azure",

From a5999f307ad9efb789e22845b1ed681b20182545 Mon Sep 17 00:00:00 2001
From: Odeyiany2
Date: Mon, 21 Oct 2024 17:04:14 +0100
Subject: [PATCH 6/6] Updated Function DocArrayInMemorySearch

---
 main.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/main.py b/main.py
index 25a66f7..21eb8c6 100644
--- a/main.py
+++ b/main.py
@@ -5,7 +5,7 @@
 from src.speech_io import transcribe_audio, synthesize_speech
 from src.rag_functions import (allowed_files, file_check_num,
                                extract_contents_from_doc, chunk_document, logger)
-from langchain.embeddings import OpenAIEmbeddings, AzureOpenAIEmbeddings
+from langchain.embeddings import OpenAIEmbeddings
 from langchain_community.vectorstores import DocArrayInMemorySearch
 from langchain.schema import Document
 from langchain.chat_models import ChatOpenAI
@@ -66,10 +66,8 @@ def create_vector_store(extracted_file_paths):
     """
     try:
         #OpenAI Embedding settings
-        openai_embeddings = AzureOpenAIEmbeddings(
+        openai_embeddings = OpenAIEmbeddings(
             openai_api_version=os.getenv("OPENAI_API_VERSION"),
-            chunk_size= 1024,
-            validate_base_url=True, # Explicitly provide validate_base_url
             openai_api_key=os.getenv("API_KEY"),
             openai_api_base=os.getenv("ENDPOINT"),
             openai_api_type="azure",
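
Patches 5 and 6 switch to AzureOpenAIEmbeddings and back again. One plausible cause of the trouble being worked around: AzureOpenAIEmbeddings takes azure_endpoint rather than openai_api_base, and its base-URL validation complains when handed a bare endpoint, which may be what the validate_base_url flag was trying to silence. Below is a minimal sketch of the Azure-native configuration; the langchain-openai package and its parameter names are assumptions based on the library versions current at the time of this PR, and the deployment name is carried over from patch 1.

    # Sketch (not part of the PR): Azure-native embeddings configuration.
    import os
    from langchain_openai import AzureOpenAIEmbeddings  # pip install langchain-openai

    embeddings = AzureOpenAIEmbeddings(
        azure_endpoint=os.getenv("ENDPOINT"),       # instead of openai_api_base
        api_key=os.getenv("API_KEY"),
        api_version=os.getenv("OPENAI_API_VERSION"),
        azure_deployment="text-embedding-ada-002",  # deployment name, from patch 1
        chunk_size=1024,                            # batch size per embedding request
    )

    vector = embeddings.embed_query("hello")
    print(len(vector))  # e.g. 1536 dimensions for text-embedding-ada-002
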