Skip to content

Commit

Permalink
Merge pull request #36 from Odeyiany2/main
Browse files Browse the repository at this point in the history
Added a Function for DocArrayInMemorySearch
  • Loading branch information
Sammybams authored Oct 21, 2024
2 parents 03968e3 + a5999f3 commit e087acf
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 12 deletions.
1 change: 1 addition & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,4 @@ The project is structured as follows:
7. [Azure OpenAI Models: Deployment](https://learn.microsoft.com/azure/ai-services/openai/how-to/working-with-models?tabs=powershell?wt.mc_id=studentamb_405806)
8. [Azure Speech Service documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/?wt.mc_id=studentamb_217190)
9. [Develop Generative AI solutions with Azure OpenAI Service](https://learn.microsoft.com/en-us/training/paths/develop-ai-solutions-azure-openai/?wt.mc_id=studentamb_217190)
10. [Langchain's DocArrayInMemorySearch Documentation](https://python.langchain.com/docs/integrations/vectorstores/docarray_in_memory/)
73 changes: 61 additions & 12 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
import logging
from dotenv import load_dotenv
from src.speech_io import transcribe_audio, synthesize_speech
from src.rag_functions import allowed_files, file_check_num, extract_contents_from_doc
from langchain.chains import RetrievalQA
from src.rag_functions import (allowed_files, file_check_num,
extract_contents_from_doc, chunk_document, logger)
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
import openai
Expand All @@ -29,13 +32,13 @@ def get_llm() -> ChatOpenAI:
openai.api_type = "azure"
openai.api_version = os.getenv("OPENAI_API_VERSION")

# OpenAI Settings
openai_embeddings = OpenAIEmbeddings(
openai_api_version=os.getenv("OPENAI_API_VERSION"),
openai_api_key=os.getenv("API_KEY"),
openai_api_base=os.getenv("ENDPOINT"),
openai_api_type="azure"
)
# # OpenAI Settings
# openai_embeddings = OpenAIEmbeddings(
# openai_api_version=os.getenv("OPENAI_API_VERSION"),
# openai_api_key=os.getenv("API_KEY"),
# openai_api_base=os.getenv("ENDPOINT"),
# openai_api_type="azure"
# )

llm = ChatOpenAI(
temperature=0.3, openai_api_key=os.getenv("API_KEY"),
Expand All @@ -51,6 +54,49 @@ def get_llm() -> ChatOpenAI:

llm = get_llm()

# Function to embed the chunks created from docs and initialize a vector store
def create_vector_store(extracted_file_paths):
    """
    Embed the extracted document contents and initialize a
    DocArrayInMemorySearch vector store.

    Args:
        extracted_file_paths: Iterable of paths to files containing the
            text previously extracted from the uploaded documents.

    Returns:
        DocArrayInMemorySearch: An initialized vector store holding the
        embedded document chunks, or None if no document could be
        processed or the store could not be created.
    """
    try:
        # Azure OpenAI embedding settings (endpoint/key come from env vars)
        openai_embeddings = OpenAIEmbeddings(
            openai_api_version=os.getenv("OPENAI_API_VERSION"),
            openai_api_key=os.getenv("API_KEY"),
            openai_api_base=os.getenv("ENDPOINT"),
            openai_api_type="azure",
            deployment="text-embedding-ada-002"
        )
        logger.info("OpenAI Embeddings initialized successfully.")

        docs = []
        for file_path in extracted_file_paths:
            try:
                with open(file_path, "r", encoding="utf-8") as file:
                    text = file.read()
                chunks = chunk_document(text)
                docs.extend(Document(page_content=chunk) for chunk in chunks)
                logger.info(f"Document {file_path} chunked into {len(chunks)} chunks.")
            except Exception as e:
                # Best-effort: skip unreadable/unchunkable files but keep
                # processing the remaining ones.
                logger.error(f"Error reading or chunking file '{file_path}': {e}")
                continue

        # Guard: calling from_documents() with an empty list fails with an
        # unhelpful error — fail fast with a clear log and return None so the
        # caller's `if vector_store:` check handles it gracefully.
        if not docs:
            logger.error("No documents could be read or chunked; vector store not created.")
            return None

        # Initialize the in-memory vector store from the embedded chunks.
        vector_store = DocArrayInMemorySearch.from_documents(docs, openai_embeddings)
        logger.info("DocArrayInMemorySearch vector store initialized successfully.")
        return vector_store

    except Exception as e:
        logger.exception(f"An error occurred while initializing the vector store: {e}")
        # Explicit None (the original fell off the end implicitly); the caller
        # checks truthiness before storing the result in session state.
        return None


# Sidebar configuration for file uploads
if 'uploaded_files' not in st.session_state:
st.session_state.uploaded_files = None
Expand Down Expand Up @@ -88,8 +134,11 @@ def get_llm() -> ChatOpenAI:
if valid_file and valid_files:
try:
extraction_results = extract_contents_from_doc(valid_files, "temp_dir")
st.success(f"{len(st.session_state.uploaded_files)} file(s) uploaded and processed successfully.")
logging.info("File(s) uploaded and processed successfully.")
vector_store = create_vector_store(extraction_results)
if vector_store:
st.session_state['vector_store'] = vector_store
st.success(f"{len(st.session_state.uploaded_files)} file(s) uploaded and processed successfully.")
logging.info("File(s) uploaded and processed successfully.")
except Exception as e:
st.error("An error occurred while processing your document. Please try again.")
logging.error(f"Error extracting content from document: {e}")
Expand Down Expand Up @@ -164,4 +213,4 @@ def handle_audio_message():


# Handle audio input from user
audio_value = st.experimental_audio_input("Record a voice message", key="audio_prompt", on_change=handle_audio_message)
audio_value = st.experimental_audio_input("Record a voice message", key="audio_prompt", on_change=handle_audio_message)

0 comments on commit e087acf

Please sign in to comment.