From bd826c9238ea7cba63ac7ea0db043fefd8726f75 Mon Sep 17 00:00:00 2001
From: Odeyiany2
Date: Fri, 18 Oct 2024 21:41:01 +0100
Subject: [PATCH 1/6] Added functions to Embed document Chunks

---
 CONTRIBUTING.md      |  1 +
 main.py              | 98 ++++++++++++++++++++++++++++++++++++--------
 src/rag_functions.py | 46 +++++++++++++++++++++
 3 files changed, 128 insertions(+), 17 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 425e736..ea77a55 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -112,3 +112,4 @@ The project is structured as follows:
 7. [Azure OpenAI Models: Deployment](https://learn.microsoft.com/azure/ai-services/openai/how-to/working-with-models?tabs=powershell?wt.mc_id=studentamb_405806)
 8. [Azure Speech Service documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/?wt.mc_id=studentamb_217190)
 9. [Develop Generative AI solutions with Azure OpenAI Service](https://learn.microsoft.com/en-us/training/paths/develop-ai-solutions-azure-openai/?wt.mc_id=studentamb_217190)
+10. [Langchain's DocArrayInMemoryStore Documentation](https://python.langchain.com/docs/integrations/vectorstores/docarray_in_memory/)
diff --git a/main.py b/main.py
index 98eea8d..71af378 100644
--- a/main.py
+++ b/main.py
@@ -3,10 +3,14 @@
 import logging
 from dotenv import load_dotenv
 from src.speech_io import transcribe_audio, synthesize_speech
-from src.rag_functions import allowed_files, file_check_num, extract_contents_from_doc
-from langchain.chains import RetrievalQA
+from src.rag_functions import (allowed_files, file_check_num,
+                               extract_contents_from_doc, create_vector_store, logger)
+from langchain.chains.retrieval import create_retrieval_chain
+from langchain.chains.history_aware_retriever import create_history_aware_retriever
 from langchain.chat_models import ChatOpenAI
 from langchain.embeddings import OpenAIEmbeddings
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain_core.prompts import PromptTemplate
 import openai
 
 # Set up page configuration
@@ -28,13 +32,13 @@ def get_llm() -> ChatOpenAI:
     openai.api_type = "azure"
     openai.api_version = os.getenv("OPENAI_API_VERSION")
 
-    # OpenAI Settings
-    openai_embeddings = OpenAIEmbeddings(
-        openai_api_version=os.getenv("OPENAI_API_VERSION"),
-        openai_api_key=os.getenv("API_KEY"),
-        openai_api_base=os.getenv("ENDPOINT"),
-        openai_api_type="azure"
-    )
+    # # OpenAI Settings
+    # openai_embeddings = OpenAIEmbeddings(
+    #     openai_api_version=os.getenv("OPENAI_API_VERSION"),
+    #     openai_api_key=os.getenv("API_KEY"),
+    #     openai_api_base=os.getenv("ENDPOINT"),
+    #     openai_api_type="azure"
+    # )
     llm = ChatOpenAI(
         temperature=0.3,
         openai_api_key=os.getenv("API_KEY"),
@@ -50,6 +54,54 @@ def get_llm() -> ChatOpenAI:
 
 llm = get_llm()
 
+
+def query_response(query, vector_store):
+    """
+    Generates a response to the user's query using the vector store and the language model.
+    Uses Langchains retrieval library
+
+    Args:
+        query (str): The user's input query.
+        vector_store (DocArrayInMemorySearch): The initialized vector store
+
+    Returns:
+        str: The generated response.
+    """
+    try:
+        llm = get_llm()
+        #prompting for the llm
+        prompt_template = """Use the following excerpts to answer a query. If you can't find the answer from the provided document,
+        don't try to make up an answer. Just say "I can't find the answer from the provided document but you may want to check the following links".
+
+        Context: {context}
+
+        Question: {question}
+
+        Helpful Answer:
+        """
+        qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
+        # Create history-aware retriever
+        history_aware_retriever = create_history_aware_retriever(
+            llm,
+            vector_store.as_retriever(search_type="similarity",
+                                      search_kwargs={"k": 3},),
+            qa_prompt,)
+        #initializing a question answer chain
+        question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
+
+        #query retrieval chain
+        chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
+
+        #retrieve answer
+        response = chain.invoke({"question": query})
+        logger.info("Response successfully generated")
+        return response["answer"]
+
+    except Exception as e:
+        logger.error(f"Error occurred in generating response:{e}")
+        return "Sorry, I couldn't process your request at the moment."
+
+
 # Sidebar configuration for file uploads
 if 'uploaded_files' not in st.session_state:
     st.session_state.uploaded_files = None
@@ -87,8 +139,11 @@ def get_llm() -> ChatOpenAI:
         if valid_file and valid_files:
             try:
                 extraction_results = extract_contents_from_doc(valid_files, "temp_dir")
-                st.success(f"{len(st.session_state.uploaded_files)} file(s) uploaded and processed successfully.")
-                logging.info("File(s) uploaded and processed successfully.")
+                vector_store = create_vector_store(extraction_results)
+                if vector_store:
+                    st.session_state['vector_store'] = vector_store
+                    st.success(f"{len(st.session_state.uploaded_files)} file(s) uploaded and processed successfully.")
+                    logging.info("File(s) uploaded and processed successfully.")
             except Exception as e:
                 st.error("An error occurred while processing your document. Please try again.")
                 logging.error(f"Error extracting content from document: {e}")
@@ -97,11 +152,19 @@ def get_llm() -> ChatOpenAI:
 
 
 def send_response(message, response=None):
-    dummy_response = "Hello. How are you?"
-    st.session_state.messages.append(('assistant', response or dummy_response))
+    # dummy_response = "Hello. How are you?"
+    # st.session_state.messages.append(('assistant', response or dummy_response))
     # TODO: make async ??
-    print(response or dummy_response)
-    synthesize_speech(text=response or dummy_response)
+    vector_store = st.session_state['vector_store']
+
+    # Get the response from query_response
+    answer = query_response(message, vector_store)
+
+    # Append the assistant's response to messages
+    st.session_state.messages.append(('assistant', answer))
+
+    print(answer)
+    synthesize_speech(text=answer)
 
 
 # Chat area and audio input handling
@@ -131,8 +194,9 @@ def handle_audio_message():
     speech_text = transcribe_audio("audio.wav")
     if speech_text:
         st.session_state.messages.append(("user", speech_text))
-        send_response(speech_text, "You have a great voice")
-        logging.info("Audio transcribed successfully.")
+        #send_response(speech_text, "You have a great voice")
+        send_response(speech_text)
+        logging.info("Audio transcribed and response generated successfully.")
     else:
         # st.session_state.messages.append(("assistant", ))
         send_response(speech_text, "Sorry, I couldn't transcribe your audio. Please try again.")
diff --git a/src/rag_functions.py b/src/rag_functions.py
index 89207c0..f4ec846 100644
--- a/src/rag_functions.py
+++ b/src/rag_functions.py
@@ -8,6 +8,10 @@
 from azure.ai.formrecognizer import DocumentAnalysisClient
 from azure.core.credentials import AzureKeyCredential
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.embeddings import OpenAIEmbeddings
+from langchain_community.vectorstores import DocArrayInMemorySearch
+from langchain.schema import Document
+
 
 # Setting up logging
 logging.basicConfig(level=logging.INFO)
@@ -129,3 +133,45 @@ def extract_contents_from_doc(files, temp_dir):
             continue  # Proceed with the next file in case of an error
 
     return extracted_file_paths
+
+#function to embed the chunks created on docs and initializing a vector store
+def create_vector_store(extracted_file_paths):
+    """
+    Embeds the documents and initializes a DocArrayInMemorySearch vector store.
+
+    Args:
+        extracted_file_path: A path containing the contents extracted from the documents uploaded
+
+    Returns:
+        DocArrayInMemorySearch: An initialized vector store with embedded documents.
+    """
+    try:
+        #OpenAI Embedding settings
+        openai_embeddings = OpenAIEmbeddings(
+            openai_api_version=os.getenv("OPENAI_API_VERSION"),
+            openai_api_key=os.getenv("API_KEY"),
+            openai_api_base=os.getenv("ENDPOINT"),
+            openai_api_type="azure",
+            deployment="text-embedding-ada-002"
+        )
+        logger.info("OpenAI Embeddings initialized successfully.")
+        docs = []
+        for file_path in extracted_file_paths:
+            try:
+                with open(file_path, "r", encoding="utf-8") as file:
+                    text = file.read()
+                    chunks = chunk_document(text)
+                    docs.extend([Document(page_content=chunk) for chunk in chunks])
+                    logger.info(f"Document {file_path} chunked into {len(chunks)} chunks.")
+            except Exception as e:
+                logger.error(f"Error reading or chunking file '{file_path}': {e}")
+                continue
+
+        #initializing the vector store
+        vector_store = DocArrayInMemorySearch.from_documents(docs, openai_embeddings)
+        logger.info("DocArrayInMemorySearch vector store initialized successfully.")
+
+        return vector_store
+
+    except Exception as e:
+        logger.exception(f"An error occurred while initializing the vector store: {e}")
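
A note on the chain wiring added in this patch: in the LangChain releases this PR builds against, create_retrieval_chain feeds the retriever from the "input" key of the invoke payload and returns the model output under "answer", and create_history_aware_retriever expects a prompt containing an "input" (and usually "chat_history") variable. The patch invokes the chain with {"question": query} against a {context}/{question} prompt, which is likely to fail at runtime. Below is a minimal sketch of the same wiring with the expected keys; the prompt wording and the build_chain helper are illustrative, not part of the PR.

    # Sketch (not part of the PR): retrieval wiring with the keys that
    # create_retrieval_chain expects ("input" in, "answer" out).
    from langchain.chains.retrieval import create_retrieval_chain
    from langchain.chains.combine_documents import create_stuff_documents_chain
    from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

    qa_prompt = ChatPromptTemplate.from_messages([
        ("system", "Answer only from the excerpts below. If the answer is not "
                   "in the excerpts, say you cannot find it.\n\nContext: {context}"),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ])

    def build_chain(llm, vector_store):
        # Retrieve the 3 most similar chunks, then stuff them into {context}.
        retriever = vector_store.as_retriever(search_type="similarity",
                                              search_kwargs={"k": 3})
        combine_docs_chain = create_stuff_documents_chain(llm, qa_prompt)
        return create_retrieval_chain(retriever, combine_docs_chain)

    # Usage:
    #   chain = build_chain(get_llm(), st.session_state["vector_store"])
    #   result = chain.invoke({"input": "What does the report conclude?",
    #                          "chat_history": []})
    #   print(result["answer"])
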
From 4b746454f4462f459b930bcac06ceb24b106e971 Mon Sep 17 00:00:00 2001
From: Odeyiany2
Date: Sat, 19 Oct 2024 08:42:12 +0100
Subject: [PATCH 2/6] Added a function for DocArrayInMemorySearch

---
 main.py              | 99 ++++++++++++++++++++------------------------
 src/rag_functions.py | 45 --------------------
 2 files changed, 44 insertions(+), 100 deletions(-)

diff --git a/main.py b/main.py
index 71af378..839a37b 100644
--- a/main.py
+++ b/main.py
@@ -4,7 +4,10 @@
 from dotenv import load_dotenv
 from src.speech_io import transcribe_audio, synthesize_speech
 from src.rag_functions import (allowed_files, file_check_num,
-                               extract_contents_from_doc, create_vector_store, logger)
+                               extract_contents_from_doc, chunk_document, logger)
+from langchain.embeddings import OpenAIEmbeddings
+from langchain_community.vectorstores import DocArrayInMemorySearch
+from langchain.schema import Document
 from langchain.chains.retrieval import create_retrieval_chain
 from langchain.chains.history_aware_retriever import create_history_aware_retriever
 from langchain.chat_models import ChatOpenAI
@@ -54,52 +57,47 @@ def get_llm() -> ChatOpenAI:
 
 llm = get_llm()
 
-
-def query_response(query, vector_store):
+#function to embed the chunks created on docs and initializing a vector store
+def create_vector_store(extracted_file_paths):
     """
-    Generates a response to the user's query using the vector store and the language model.
-    Uses Langchains retrieval library
+    Embeds the documents and initializes a DocArrayInMemorySearch vector store.
 
     Args:
-        query (str): The user's input query.
-        vector_store (DocArrayInMemorySearch): The initialized vector store
+        extracted_file_path: A path containing the contents extracted from the documents uploaded
 
     Returns:
-        str: The generated response.
+        DocArrayInMemorySearch: An initialized vector store with embedded documents.
     """
     try:
-        llm = get_llm()
-        #prompting for the llm
-        prompt_template = """Use the following excerpts to answer a query. If you can't find the answer from the provided document,
-        don't try to make up an answer. Just say "I can't find the answer from the provided document but you may want to check the following links".
-
-        Context: {context}
-
-        Question: {question}
-
-        Helpful Answer:
-        """
-        qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
-        # Create history-aware retriever
-        history_aware_retriever = create_history_aware_retriever(
-            llm,
-            vector_store.as_retriever(search_type="similarity",
-                                      search_kwargs={"k": 3},),
-            qa_prompt,)
-        #initializing a question answer chain
-        question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
-
-        #query retrieval chain
-        chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
-
-        #retrieve answer
-        response = chain.invoke({"question": query})
-        logger.info("Response successfully generated")
-        return response["answer"]
-
+        #OpenAI Embedding settings
+        openai_embeddings = OpenAIEmbeddings(
+            openai_api_version=os.getenv("OPENAI_API_VERSION"),
+            openai_api_key=os.getenv("API_KEY"),
+            openai_api_base=os.getenv("ENDPOINT"),
+            openai_api_type="azure",
+            deployment="text-embedding-ada-002"
+        )
+        logger.info("OpenAI Embeddings initialized successfully.")
+        docs = []
+        for file_path in extracted_file_paths:
+            try:
+                with open(file_path, "r", encoding="utf-8") as file:
+                    text = file.read()
+                    chunks = chunk_document(text)
+                    docs.extend([Document(page_content=chunk) for chunk in chunks])
+                    logger.info(f"Document {file_path} chunked into {len(chunks)} chunks.")
+            except Exception as e:
+                logger.error(f"Error reading or chunking file '{file_path}': {e}")
+                continue
+
+        #initializing the vector store
+        vector_store = DocArrayInMemorySearch.from_documents(docs, openai_embeddings)
+        logger.info("DocArrayInMemorySearch vector store initialized successfully.")
+
+        return vector_store
+
     except Exception as e:
-        logger.error(f"Error occurred in generating response:{e}")
-        return "Sorry, I couldn't process your request at the moment."
+        logger.exception(f"An error occurred while initializing the vector store: {e}")
 
 
 # Sidebar configuration for file uploads
@@ -152,19 +150,11 @@ def query_response(query, vector_store):
 
 
 def send_response(message, response=None):
-    # dummy_response = "Hello. How are you?"
-    # st.session_state.messages.append(('assistant', response or dummy_response))
+    dummy_response = "Hello. How are you?"
+    st.session_state.messages.append(('assistant', response or dummy_response))
     # TODO: make async ??
-    vector_store = st.session_state['vector_store']
-
-    # Get the response from query_response
-    answer = query_response(message, vector_store)
-
-    # Append the assistant's response to messages
-    st.session_state.messages.append(('assistant', answer))
-
-    print(answer)
-    synthesize_speech(text=answer)
+    print(response or dummy_response)
+    synthesize_speech(text=response or dummy_response)
 
 
 # Chat area and audio input handling
@@ -194,9 +184,8 @@ def handle_audio_message():
     speech_text = transcribe_audio("audio.wav")
     if speech_text:
         st.session_state.messages.append(("user", speech_text))
-        #send_response(speech_text, "You have a great voice")
-        send_response(speech_text)
-        logging.info("Audio transcribed and response generated successfully.")
+        send_response(speech_text, "You have a great voice")
+        logging.info("Audio transcribed successfully.")
     else:
         # st.session_state.messages.append(("assistant", ))
         send_response(speech_text, "Sorry, I couldn't transcribe your audio. Please try again.")
 
 
 
 # Handle audio input from user
-audio_value = st.experimental_audio_input("Record a voice message", key="audio_prompt", on_change=handle_audio_message)
+audio_value = st.experimental_audio_input("Record a voice message", key="audio_prompt", on_change=handle_audio_message)
\ No newline at end of file
diff --git a/src/rag_functions.py b/src/rag_functions.py
index f4ec846..9c02683 100644
--- a/src/rag_functions.py
+++ b/src/rag_functions.py
@@ -8,9 +8,6 @@
 from azure.ai.formrecognizer import DocumentAnalysisClient
 from azure.core.credentials import AzureKeyCredential
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.embeddings import OpenAIEmbeddings
-from langchain_community.vectorstores import DocArrayInMemorySearch
-from langchain.schema import Document
 
 
 # Setting up logging
@@ -133,45 +130,3 @@ def extract_contents_from_doc(files, temp_dir):
             continue  # Proceed with the next file in case of an error
 
     return extracted_file_paths
-
-#function to embed the chunks created on docs and initializing a vector store
-def create_vector_store(extracted_file_paths):
-    """
-    Embeds the documents and initializes a DocArrayInMemorySearch vector store.
-
-    Args:
-        extracted_file_path: A path containing the contents extracted from the documents uploaded
-
-    Returns:
-        DocArrayInMemorySearch: An initialized vector store with embedded documents.
-    """
-    try:
-        #OpenAI Embedding settings
-        openai_embeddings = OpenAIEmbeddings(
-            openai_api_version=os.getenv("OPENAI_API_VERSION"),
-            openai_api_key=os.getenv("API_KEY"),
-            openai_api_base=os.getenv("ENDPOINT"),
-            openai_api_type="azure",
-            deployment="text-embedding-ada-002"
-        )
-        logger.info("OpenAI Embeddings initialized successfully.")
-        docs = []
-        for file_path in extracted_file_paths:
-            try:
-                with open(file_path, "r", encoding="utf-8") as file:
-                    text = file.read()
-                    chunks = chunk_document(text)
-                    docs.extend([Document(page_content=chunk) for chunk in chunks])
-                    logger.info(f"Document {file_path} chunked into {len(chunks)} chunks.")
-            except Exception as e:
-                logger.error(f"Error reading or chunking file '{file_path}': {e}")
-                continue
-
-        #initializing the vector store
-        vector_store = DocArrayInMemorySearch.from_documents(docs, openai_embeddings)
-        logger.info("DocArrayInMemorySearch vector store initialized successfully.")
-
-        return vector_store
-
-    except Exception as e:
-        logger.exception(f"An error occurred while initializing the vector store: {e}")
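
With this patch the embed-and-store logic lives in main.py. The flow can be exercised in isolation; a minimal sketch follows, assuming the docarray package is installed (DocArrayInMemorySearch depends on it) and the repo's API_KEY, ENDPOINT, and OPENAI_API_VERSION environment variables are set. The sample texts and query are illustrative, not from the PR.

    # Sketch (not part of the PR): embed a few chunks and query the store.
    import os
    from langchain.embeddings import OpenAIEmbeddings
    from langchain_community.vectorstores import DocArrayInMemorySearch
    from langchain.schema import Document

    embeddings = OpenAIEmbeddings(
        openai_api_version=os.getenv("OPENAI_API_VERSION"),
        openai_api_key=os.getenv("API_KEY"),
        openai_api_base=os.getenv("ENDPOINT"),
        openai_api_type="azure",
        deployment="text-embedding-ada-002",  # Azure deployment name used by the PR
    )

    docs = [Document(page_content=text) for text in
            ("The report covers Q3 revenue.", "Appendix B lists the data sources.")]
    store = DocArrayInMemorySearch.from_documents(docs, embeddings)

    # Return the k most similar chunks for a query.
    for hit in store.similarity_search("Where is revenue discussed?", k=1):
        print(hit.page_content)
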
From 3d42cf1906cbf3d06e6b26b055eb6c4be861df67 Mon Sep 17 00:00:00 2001
From: Odeyiany2
Date: Sat, 19 Oct 2024 08:44:42 +0100
Subject: [PATCH 3/6] Added a function for DocArrayInMemorySearch

---
 main.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/main.py b/main.py
index 839a37b..21eb8c6 100644
--- a/main.py
+++ b/main.py
@@ -8,12 +8,8 @@
 from langchain.embeddings import OpenAIEmbeddings
 from langchain_community.vectorstores import DocArrayInMemorySearch
 from langchain.schema import Document
-from langchain.chains.retrieval import create_retrieval_chain
-from langchain.chains.history_aware_retriever import create_history_aware_retriever
 from langchain.chat_models import ChatOpenAI
 from langchain.embeddings import OpenAIEmbeddings
-from langchain.chains.combine_documents import create_stuff_documents_chain
-from langchain_core.prompts import PromptTemplate
 import openai
 
 # Set up page configuration

From d64af354f1bfd459a751fc456b382524885ddd11 Mon Sep 17 00:00:00 2001
From: Samuel Bamgbola
Date: Sat, 19 Oct 2024 20:10:00 +0100
Subject: [PATCH 4/6] Removed redundant newline

---
 src/rag_functions.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/rag_functions.py b/src/rag_functions.py
index 9c02683..89207c0 100644
--- a/src/rag_functions.py
+++ b/src/rag_functions.py
@@ -9,7 +9,6 @@
 from azure.core.credentials import AzureKeyCredential
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
-
 # Setting up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
From 15fbade4439d5a276f5cb72e6b8ba9b82599bf7e Mon Sep 17 00:00:00 2001
From: Odeyiany2
Date: Mon, 21 Oct 2024 14:16:24 +0100
Subject: [PATCH 5/6] Updated Azure OpenAI Embedding Model

---
 main.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/main.py b/main.py
index 21eb8c6..25a66f7 100644
--- a/main.py
+++ b/main.py
@@ -5,7 +5,7 @@
 from src.speech_io import transcribe_audio, synthesize_speech
 from src.rag_functions import (allowed_files, file_check_num,
                                extract_contents_from_doc, chunk_document, logger)
-from langchain.embeddings import OpenAIEmbeddings
+from langchain.embeddings import OpenAIEmbeddings, AzureOpenAIEmbeddings
 from langchain_community.vectorstores import DocArrayInMemorySearch
 from langchain.schema import Document
 from langchain.chat_models import ChatOpenAI
@@ -66,8 +66,10 @@ def create_vector_store(extracted_file_paths):
     """
     try:
         #OpenAI Embedding settings
-        openai_embeddings = OpenAIEmbeddings(
+        openai_embeddings = AzureOpenAIEmbeddings(
             openai_api_version=os.getenv("OPENAI_API_VERSION"),
+            chunk_size= 1024,
+            validate_base_url=True, # Explicitly provide validate_base_url
             openai_api_key=os.getenv("API_KEY"),
             openai_api_base=os.getenv("ENDPOINT"),
             openai_api_type="azure",

From a5999f307ad9efb789e22845b1ed681b20182545 Mon Sep 17 00:00:00 2001
From: Odeyiany2
Date: Mon, 21 Oct 2024 17:04:14 +0100
Subject: [PATCH 6/6] Updated Function DocArrayInMemorySearch

---
 main.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/main.py b/main.py
index 25a66f7..21eb8c6 100644
--- a/main.py
+++ b/main.py
@@ -5,7 +5,7 @@
 from src.speech_io import transcribe_audio, synthesize_speech
 from src.rag_functions import (allowed_files, file_check_num,
                                extract_contents_from_doc, chunk_document, logger)
-from langchain.embeddings import OpenAIEmbeddings, AzureOpenAIEmbeddings
+from langchain.embeddings import OpenAIEmbeddings
 from langchain_community.vectorstores import DocArrayInMemorySearch
 from langchain.schema import Document
 from langchain.chat_models import ChatOpenAI
@@ -66,10 +66,8 @@ def create_vector_store(extracted_file_paths):
     """
     try:
         #OpenAI Embedding settings
-        openai_embeddings = AzureOpenAIEmbeddings(
+        openai_embeddings = OpenAIEmbeddings(
             openai_api_version=os.getenv("OPENAI_API_VERSION"),
-            chunk_size= 1024,
-            validate_base_url=True, # Explicitly provide validate_base_url
             openai_api_key=os.getenv("API_KEY"),
             openai_api_base=os.getenv("ENDPOINT"),
             openai_api_type="azure",
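
Patches 5 and 6 switch to AzureOpenAIEmbeddings and back again. One plausible cause of the trouble being worked around: AzureOpenAIEmbeddings takes azure_endpoint rather than openai_api_base, and its base-URL validation complains when handed a bare endpoint, which may be what the validate_base_url flag was trying to silence. Below is a minimal sketch of the Azure-native configuration; the langchain-openai package and its parameter names are assumptions based on the library versions current at the time of this PR, and the deployment name is carried over from patch 1.

    # Sketch (not part of the PR): Azure-native embeddings configuration.
    import os
    from langchain_openai import AzureOpenAIEmbeddings  # pip install langchain-openai

    embeddings = AzureOpenAIEmbeddings(
        azure_endpoint=os.getenv("ENDPOINT"),       # instead of openai_api_base
        api_key=os.getenv("API_KEY"),
        api_version=os.getenv("OPENAI_API_VERSION"),
        azure_deployment="text-embedding-ada-002",  # deployment name, from patch 1
        chunk_size=1024,                            # batch size per embedding request
    )

    vector = embeddings.embed_query("hello")
    print(len(vector))  # e.g. 1536 dimensions for text-embedding-ada-002
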