Skip to content

Commit

Permalink
Merge pull request #36 from Odeyiany2/main
Browse files Browse the repository at this point in the history
Added a Function for DocArrayInMemorySearch
  • Loading branch information
Sammybams authored Oct 21, 2024
2 parents 03968e3 + a5999f3 commit e087acf
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 12 deletions.
1 change: 1 addition & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,4 @@ The project is structured as follows:
7. [Azure OpenAI Models: Deployment](https://learn.microsoft.com/azure/ai-services/openai/how-to/working-with-models?tabs=powershell?wt.mc_id=studentamb_405806)
8. [Azure Speech Service documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/?wt.mc_id=studentamb_217190)
9. [Develop Generative AI solutions with Azure OpenAI Service](https://learn.microsoft.com/en-us/training/paths/develop-ai-solutions-azure-openai/?wt.mc_id=studentamb_217190)
10. [Langchain's DocArrayInMemorySearch Documentation](https://python.langchain.com/docs/integrations/vectorstores/docarray_in_memory/)
73 changes: 61 additions & 12 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
import logging
from dotenv import load_dotenv
from src.speech_io import transcribe_audio, synthesize_speech
from src.rag_functions import allowed_files, file_check_num, extract_contents_from_doc
from langchain.chains import RetrievalQA
from src.rag_functions import (allowed_files, file_check_num,
extract_contents_from_doc, chunk_document, logger)
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
import openai
Expand All @@ -29,13 +32,13 @@ def get_llm() -> ChatOpenAI:
openai.api_type = "azure"
openai.api_version = os.getenv("OPENAI_API_VERSION")

# OpenAI Settings
openai_embeddings = OpenAIEmbeddings(
openai_api_version=os.getenv("OPENAI_API_VERSION"),
openai_api_key=os.getenv("API_KEY"),
openai_api_base=os.getenv("ENDPOINT"),
openai_api_type="azure"
)
# # OpenAI Settings
# openai_embeddings = OpenAIEmbeddings(
# openai_api_version=os.getenv("OPENAI_API_VERSION"),
# openai_api_key=os.getenv("API_KEY"),
# openai_api_base=os.getenv("ENDPOINT"),
# openai_api_type="azure"
# )

llm = ChatOpenAI(
temperature=0.3, openai_api_key=os.getenv("API_KEY"),
Expand All @@ -51,6 +54,49 @@ def get_llm() -> ChatOpenAI:

llm = get_llm()

# Function to embed the chunks created from docs and initialize a vector store
def create_vector_store(extracted_file_paths):
    """
    Embed the extracted document contents and initialize a
    DocArrayInMemorySearch vector store.

    Args:
        extracted_file_paths: Iterable of paths to files containing the
            text previously extracted from the uploaded documents.

    Returns:
        DocArrayInMemorySearch: An initialized vector store holding the
        embedded document chunks, or None if no document could be
        processed or the store could not be created.
    """
    try:
        # Azure OpenAI embedding settings (endpoint/key come from env vars)
        openai_embeddings = OpenAIEmbeddings(
            openai_api_version=os.getenv("OPENAI_API_VERSION"),
            openai_api_key=os.getenv("API_KEY"),
            openai_api_base=os.getenv("ENDPOINT"),
            openai_api_type="azure",
            deployment="text-embedding-ada-002"
        )
        logger.info("OpenAI Embeddings initialized successfully.")

        docs = []
        for file_path in extracted_file_paths:
            try:
                with open(file_path, "r", encoding="utf-8") as file:
                    text = file.read()
                chunks = chunk_document(text)
                docs.extend(Document(page_content=chunk) for chunk in chunks)
                logger.info(f"Document {file_path} chunked into {len(chunks)} chunks.")
            except Exception as e:
                # Best-effort: skip unreadable/unchunkable files but keep
                # processing the remaining ones.
                logger.error(f"Error reading or chunking file '{file_path}': {e}")
                continue

        # Guard: calling from_documents() with an empty list fails with an
        # unhelpful error — fail fast with a clear log and return None so the
        # caller's `if vector_store:` check handles it gracefully.
        if not docs:
            logger.error("No documents could be read or chunked; vector store not created.")
            return None

        # Initialize the in-memory vector store from the embedded chunks.
        vector_store = DocArrayInMemorySearch.from_documents(docs, openai_embeddings)
        logger.info("DocArrayInMemorySearch vector store initialized successfully.")
        return vector_store

    except Exception as e:
        logger.exception(f"An error occurred while initializing the vector store: {e}")
        # Explicit None (the original fell off the end implicitly); the caller
        # checks truthiness before storing the result in session state.
        return None


# Sidebar configuration for file uploads
if 'uploaded_files' not in st.session_state:
st.session_state.uploaded_files = None
Expand Down Expand Up @@ -88,8 +134,11 @@ def get_llm() -> ChatOpenAI:
if valid_file and valid_files:
try:
extraction_results = extract_contents_from_doc(valid_files, "temp_dir")
st.success(f"{len(st.session_state.uploaded_files)} file(s) uploaded and processed successfully.")
logging.info("File(s) uploaded and processed successfully.")
vector_store = create_vector_store(extraction_results)
if vector_store:
st.session_state['vector_store'] = vector_store
st.success(f"{len(st.session_state.uploaded_files)} file(s) uploaded and processed successfully.")
logging.info("File(s) uploaded and processed successfully.")
except Exception as e:
st.error("An error occurred while processing your document. Please try again.")
logging.error(f"Error extracting content from document: {e}")
Expand Down Expand Up @@ -164,4 +213,4 @@ def handle_audio_message():


# Handle audio input from user
audio_value = st.experimental_audio_input("Record a voice message", key="audio_prompt", on_change=handle_audio_message)
audio_value = st.experimental_audio_input("Record a voice message", key="audio_prompt", on_change=handle_audio_message)

0 comments on commit e087acf

Please sign in to comment.