Merge pull request #204 from DrAlzahraniProjects/enhancements

Enhancements (faster loading when refresh + removed duplicates)
DrAlzahraniProjects · Nov 24, 2024 · af0e4f3 · af0e4f3
2 parents e21d3d9 + f1aeef4
commit af0e4f3
Show file tree

Hide file tree

Showing 3 changed files with 50 additions and 10 deletions.
diff --git a/RAG.py b/RAG.py
@@ -168,16 +168,17 @@ async def _load_documents_from_web_and_db(collection: Collection):
 
 def initialize_milvus(uri: str=MILVUS_URI):
     """
-    Initialize the vector store for the RAG model
+    Initialize the Milvus database with the vector store
 
     Args:
-        uri (str, optional): Path to the local milvus db. Defaults to MILVUS_URI.
-
-    Returns:
-        vector_store: The vector store created
+        uri (str, optional): The URI of the Milvus database. Defaults to MILVUS_URI.
     """
     connections.connect("default",uri=MILVUS_URI)
-
+
+    if os.environ.get("vector_store_initialized", False):
+        # passing an empty list to just load the vector store
+        create_vector_store([])
+        return
     if vector_store_check(uri):
         collection = Collection(re.sub(r'\W+', '', CORPUS_SOURCE))
         documents, existing_hashes = asyncio.run(_load_documents_from_web_and_db(collection))
@@ -296,7 +297,26 @@ def split_documents(documents):
     )
     # Split the documents into chunks
     docs = text_splitter.split_documents(documents)
-    return docs
+    unique_docs = remove_duplicates(docs)
+    return unique_docs
+
+def remove_duplicates(documents):
+    """
+    Keep only the first document for each distinct page content
+
+    Args:
+        documents (list): Documents exposing a hashable ``page_content``
+
+    Returns:
+        list: The retained documents, in their original order
+    """
+    first_by_content = {}
+    for document in documents:
+        # setdefault stores a document only when its content is new,
+        # so later duplicates never replace the first occurrence
+        first_by_content.setdefault(document.page_content, document)
+    # dicts keep insertion order, which preserves the input ordering
+    return list(first_by_content.values())
 
 def vector_store_check(uri):
     """

diff --git a/app.py b/app.py
@@ -11,19 +11,39 @@
 
 
 def initialize_vector_store():
-    if not hasattr(st.session_state, "vector_store_initialized"):
+    """
+    Run initialize_milvus() unless the vector_store_initialized env var is set, then set it.
+    """
+    if "vector_store_initialized" not in os.environ:  # os.environ is a mapping, not an attribute bag
         initialize_milvus()
-        st.session_state.vector_store_initialized = True
-
+        os.environ["vector_store_initialized"] = "1"  # values must be str; read via os.environ.get in RAG.py
 
 def remove_special_characters(input_string):
+    """
+    Delete the listed punctuation characters, lowercase, and trim whitespace.
+
+    Args:
+        input_string (str): The input string.
+
+    Returns:
+        str: The input without those characters, lowercased and stripped.
+    """
     special_characters = "?!@#$%^&*()-_=+[]{}\\|;:'\",<>/`~"
     return input_string.translate(str.maketrans("", "", special_characters)).lower().strip()
 
 
 class StreamlitApp:
+    """
+    Streamlit App class to handle the chatbot app.
+    """
 
     def __init__(self, session_state=st.session_state):
+        """
+        Initialize the Streamlit App.
+
+        Args:
+            session_state (dict): The session state dictionary.
+        """
         self.db_client = DatabaseClient()
         if "app_initialized" not in st.session_state:
             st.session_state.app_initialized = False

diff --git a/milvus/milvus_vector.db b/milvus/milvus_vector.db