Skip to content
This repository has been archived by the owner on Dec 9, 2024. It is now read-only.

Commit

Permalink
Merge pull request #204 from DrAlzahraniProjects/enhancements
Browse files Browse the repository at this point in the history
Enhancements (faster loading when refresh + removed duplicates)
  • Loading branch information
smrchanda877 authored Nov 24, 2024
2 parents e21d3d9 + f1aeef4 commit af0e4f3
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 10 deletions.
34 changes: 27 additions & 7 deletions RAG.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,16 +168,17 @@ async def _load_documents_from_web_and_db(collection: Collection):

def initialize_milvus(uri: str=MILVUS_URI):
"""
Initialize the vector store for the RAG model
Initialize the Milvus database with the vector store
Args:
uri (str, optional): Path to the local milvus db. Defaults to MILVUS_URI.
Returns:
vector_store: The vector store created
uri (str, optional): The URI of the Milvus database. Defaults to MILVUS_URI.
"""
connections.connect("default",uri=MILVUS_URI)


if os.environ.get("vector_store_initialized", False):
# passing an empty list to just load the vector store
create_vector_store([])
return
if vector_store_check(uri):
collection = Collection(re.sub(r'\W+', '', CORPUS_SOURCE))
documents, existing_hashes = asyncio.run(_load_documents_from_web_and_db(collection))
Expand Down Expand Up @@ -296,7 +297,26 @@ def split_documents(documents):
)
# Split the documents into chunks
docs = text_splitter.split_documents(documents)
return docs
unique_docs = remove_duplicates(docs)
return unique_docs

def remove_duplicates(documents):
    """
    Drop documents whose page content has already been seen.

    The first document carrying a given ``page_content`` is kept and the
    original relative order of the surviving documents is preserved.

    Args:
        documents (list): Documents exposing a hashable ``page_content`` attribute
    Returns:
        list: The documents with later duplicates (by page content) removed
    """
    # dicts keep insertion order, and setdefault never overwrites an existing
    # key, so each content string maps to the first document that carried it.
    first_by_content = {}
    for document in documents:
        first_by_content.setdefault(document.page_content, document)
    return list(first_by_content.values())

def vector_store_check(uri):
"""
Expand Down
26 changes: 23 additions & 3 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,39 @@


def initialize_vector_store():
    """
    Initialize the Milvus vector store once per process.

    The "already initialized" marker is stored as a real environment variable
    (a key in the ``os.environ`` mapping) so that the
    ``os.environ.get("vector_store_initialized", ...)`` check in
    ``initialize_milvus`` observes the same flag. Setting it as a Python
    attribute on ``os.environ`` would not create an environment variable and
    would be invisible to that check.

    NOTE(review): if the variable is already exported in the launching
    environment, ``initialize_milvus`` is skipped entirely - confirm this is
    the desired behavior for deployments.
    """
    if "vector_store_initialized" not in os.environ:
        initialize_milvus()
        # Environment values must be strings; assigning a bool raises TypeError.
        os.environ["vector_store_initialized"] = "True"

def remove_special_characters(input_string):
    """
    Strip punctuation-like characters from a string and normalize it.

    Args:
        input_string (str): The text to clean.
    Returns:
        str: The text with the listed special characters deleted, lowercased,
        and with leading/trailing whitespace trimmed.
    """
    unwanted = "?!@#$%^&*()-_=+[]{}\\|;:'\",<>/`~"
    # A translation table with only a deletion set removes every listed
    # character in a single pass.
    deletion_table = str.maketrans("", "", unwanted)
    cleaned = input_string.translate(deletion_table)
    return cleaned.lower().strip()


class StreamlitApp:
"""
Streamlit App class to handle the chatbot app.
"""

def __init__(self, session_state=st.session_state):
"""
Initialize the Streamlit App.
Args:
session_state (dict): The session state dictionary.
"""
self.db_client = DatabaseClient()
if "app_initialized" not in st.session_state:
st.session_state.app_initialized = False
Expand Down
Binary file modified milvus/milvus_vector.db
Binary file not shown.

0 comments on commit af0e4f3

Please sign in to comment.