Removed Chats from Media DB (BREAKING CHANGE - moved to RAG QA db), Added mindmap viewing, token counts in convos, anki flashcard validation #409

Merged 46 commits on Nov 2, 2024

Commits (46)
d839d10  Update README.md (rmusser01, Oct 26, 2024)
afd655c  Update README.md (rmusser01, Oct 26, 2024)
87caad1  well, here we go (rmusser01, Oct 26, 2024)
a40c8d6  App boots successfully (rmusser01, Oct 26, 2024)
02cc0bb  I fucking hate claude 3.6. Holy shit. (rmusser01, Oct 27, 2024)
e30200f  Can save/update chats from basic 'Chat with an LLM' (rmusser01, Oct 27, 2024)
cc66410  Update Chat_Workflows.py (rmusser01, Oct 27, 2024)
0c68832  Update test_enhanced_rag_pipeline.py (rmusser01, Oct 27, 2024)
a9a4aa5  Checkpoint (rmusser01, Oct 27, 2024)
71db360  Now possible to view Embeddings for the Different DBs (rmusser01, Oct 27, 2024)
fc88b40  Update RAG_Library_2.py (rmusser01, Oct 27, 2024)
9f1ecbd  Checkpoint (rmusser01, Oct 27, 2024)
9ad681e  DB Selection works (rmusser01, Oct 27, 2024)
a814e42  RAG Chat Search (rmusser01, Oct 27, 2024)
7f986fb  checkpoint (rmusser01, Oct 28, 2024)
4c25b62  Update RAG_QA_Chat_tab.py (rmusser01, Oct 28, 2024)
71ec096  checkpoint (rmusser01, Oct 29, 2024)
c214067  Update Gradio_Related.py (rmusser01, Oct 29, 2024)
a341975  checkpoint (rmusser01, Oct 29, 2024)
04916c2  checkpoint (rmusser01, Oct 30, 2024)
467ea82  and holy shit we have anki deck creation (rmusser01, Oct 30, 2024)
88495a4  Anki flashcard creation prompt (rmusser01, Oct 30, 2024)
4d07997  Update Gradio_Related.py (rmusser01, Oct 30, 2024)
f5f0519  Update Writing_tab.py (rmusser01, Oct 30, 2024)
d23e509  Show custom prompts in RAG QA Chat + Merge Media DB Search into the t… (rmusser01, Oct 30, 2024)
c558cc4  spelling (rmusser01, Oct 30, 2024)
9ccb206  Checkpoint (rmusser01, Oct 31, 2024)
2164daa  Update Book_Ingestion_tab.py (rmusser01, Oct 31, 2024)
c82b94e  Another checkpoint (rmusser01, Oct 31, 2024)
5d0b465  PDFs (rmusser01, Oct 31, 2024)
e6e7385  hey it works (I think) (rmusser01, Oct 31, 2024)
88ec830  checkpoint (rmusser01, Oct 31, 2024)
5660c51  DB Migration script partial (rmusser01, Oct 31, 2024)
dd3127a  Update RAG_QA_Chat_DB.py (rmusser01, Oct 31, 2024)
0401802  Update DB_Manager.py (rmusser01, Oct 31, 2024)
78850ad  Update DB_Migration.py (rmusser01, Oct 31, 2024)
57a43b1  don't know who's more retarded, me or claude (rmusser01, Oct 31, 2024)
568aeb8  Update DB_Backups.py (rmusser01, Oct 31, 2024)
beddab0  Update DB_Backups.py (rmusser01, Oct 31, 2024)
6b891bf  RAG Test works now... (rmusser01, Nov 1, 2024)
b9ac61e  Update ChromaDB_Library.py (rmusser01, Nov 1, 2024)
b42d37c  wew. (rmusser01, Nov 1, 2024)
91c6ed6  Update SQLite_DB.py (rmusser01, Nov 1, 2024)
b35881c  Update blank-front.png (rmusser01, Nov 1, 2024)
35a739b  Update RAG_QA_Chat_tab.py (rmusser01, Nov 1, 2024)
657aec0  Fixed Tests + Fixed Note Search + Migration script fixed (rmusser01, Nov 2, 2024)
6 changes: 6 additions & 0 deletions .github/workflows/python-app.yml
@@ -42,6 +42,12 @@ jobs:
         cd ./Tests/RAG
         pytest test_RAG_Library_2.py
 
+    - name: Test RAG Notes functions with pytest
+      run: |
+        pwd
+        cd ./Tests/RAG_QA_Chat
+        pytest test_notes_search.py
+
     - name: Test SQLite lib functions with pytest
       run: |
         pwd
357 changes: 131 additions & 226 deletions App_Function_Libraries/Audio/Audio_Files.py

Large diffs are not rendered by default.

172 changes: 83 additions & 89 deletions App_Function_Libraries/Books/Book_Ingestion_Lib.py
@@ -385,109 +385,103 @@ def process_markdown_content(markdown_content, file_path, title, author, keyword
return f"Document '{title}' imported successfully. Database result: {result}"


def import_file_handler(file,
title,
author,
keywords,
system_prompt,
custom_prompt,
auto_summarize,
api_name,
api_key,
max_chunk_size,
chunk_overlap,
custom_chapter_pattern
):
def import_file_handler(files,
author,
keywords,
system_prompt,
custom_prompt,
auto_summarize,
api_name,
api_key,
max_chunk_size,
chunk_overlap,
custom_chapter_pattern):
try:
log_counter("file_import_attempt", labels={"file_name": file.name})

# Handle max_chunk_size
if isinstance(max_chunk_size, str):
max_chunk_size = int(max_chunk_size) if max_chunk_size.strip() else 4000
elif not isinstance(max_chunk_size, int):
max_chunk_size = 4000 # Default value if not a string or int

# Handle chunk_overlap
if isinstance(chunk_overlap, str):
chunk_overlap = int(chunk_overlap) if chunk_overlap.strip() else 0
elif not isinstance(chunk_overlap, int):
chunk_overlap = 0 # Default value if not a string or int

chunk_options = {
'method': 'chapter',
'max_size': max_chunk_size,
'overlap': chunk_overlap,
'custom_chapter_pattern': custom_chapter_pattern if custom_chapter_pattern else None
}
if not files:
return "No files uploaded."

if file is None:
log_counter("file_import_error", labels={"error": "No file uploaded"})
return "No file uploaded."
# Convert single file to list for consistent processing
if not isinstance(files, list):
files = [files]

file_path = file.name
if not os.path.exists(file_path):
log_counter("file_import_error", labels={"error": "File not found", "file_name": file.name})
return "Uploaded file not found."
results = []
for file in files:
log_counter("file_import_attempt", labels={"file_name": file.name})

start_time = datetime.now()
# Handle max_chunk_size and chunk_overlap
chunk_size = int(max_chunk_size) if isinstance(max_chunk_size, (str, int)) else 4000
overlap = int(chunk_overlap) if isinstance(chunk_overlap, (str, int)) else 0

if file_path.lower().endswith('.epub'):
status = import_epub(
file_path,
title,
author,
keywords,
custom_prompt=custom_prompt,
system_prompt=system_prompt,
summary=None,
auto_summarize=auto_summarize,
api_name=api_name,
api_key=api_key,
chunk_options=chunk_options,
custom_chapter_pattern=custom_chapter_pattern
)
log_counter("epub_import_success", labels={"file_name": file.name})
result = f"📚 EPUB Imported Successfully:\n{status}"
elif file.name.lower().endswith('.zip'):
status = process_zip_file(
zip_file=file,
title=title,
author=author,
keywords=keywords,
custom_prompt=custom_prompt,
system_prompt=system_prompt,
summary=None,
auto_summarize=auto_summarize,
api_name=api_name,
api_key=api_key,
chunk_options=chunk_options
)
log_counter("zip_import_success", labels={"file_name": file.name})
result = f"📦 ZIP Processed Successfully:\n{status}"
elif file.name.lower().endswith(('.chm', '.html', '.pdf', '.xml', '.opml')):
file_type = file.name.split('.')[-1].upper()
log_counter("unsupported_file_type", labels={"file_type": file_type})
result = f"{file_type} file import is not yet supported."
else:
log_counter("unsupported_file_type", labels={"file_type": file.name.split('.')[-1]})
result = "❌ Unsupported file type. Please upload an `.epub` file or a `.zip` file containing `.epub` files."
chunk_options = {
'method': 'chapter',
'max_size': chunk_size,
'overlap': overlap,
'custom_chapter_pattern': custom_chapter_pattern if custom_chapter_pattern else None
}

end_time = datetime.now()
processing_time = (end_time - start_time).total_seconds()
log_histogram("file_import_duration", processing_time, labels={"file_name": file.name})
file_path = file.name
if not os.path.exists(file_path):
results.append(f"❌ File not found: {file.name}")
continue

return result
start_time = datetime.now()

# Extract title from filename
title = os.path.splitext(os.path.basename(file_path))[0]

if file_path.lower().endswith('.epub'):
status = import_epub(
file_path,
title=title, # Use filename as title
author=author,
keywords=keywords,
custom_prompt=custom_prompt,
system_prompt=system_prompt,
summary=None,
auto_summarize=auto_summarize,
api_name=api_name,
api_key=api_key,
chunk_options=chunk_options,
custom_chapter_pattern=custom_chapter_pattern
)
log_counter("epub_import_success", labels={"file_name": file.name})
results.append(f"📚 {file.name}: {status}")

elif file_path.lower().endswith('.zip'):
status = process_zip_file(
zip_file=file,
title=None, # Let each file use its own name
author=author,
keywords=keywords,
custom_prompt=custom_prompt,
system_prompt=system_prompt,
summary=None,
auto_summarize=auto_summarize,
api_name=api_name,
api_key=api_key,
chunk_options=chunk_options
)
log_counter("zip_import_success", labels={"file_name": file.name})
results.append(f"📦 {file.name}: {status}")
else:
results.append(f"❌ Unsupported file type: {file.name}")
continue

end_time = datetime.now()
processing_time = (end_time - start_time).total_seconds()
log_histogram("file_import_duration", processing_time, labels={"file_name": file.name})

return "\n\n".join(results)

except ValueError as ve:
logging.exception(f"Error parsing input values: {str(ve)}")
log_counter("file_import_error", labels={"error": "Invalid input", "file_name": file.name})
return f"❌ Error: Invalid input for chunk size or overlap. Please enter valid numbers."
except Exception as e:
logging.exception(f"Error during file import: {str(e)}")
log_counter("file_import_error", labels={"error": str(e), "file_name": file.name})
return f"❌ Error during import: {str(e)}"



def read_epub(file_path):
"""
Reads and extracts text from an EPUB file.
Expand Down Expand Up @@ -568,9 +562,9 @@ def ingest_text_file(file_path, title=None, author=None, keywords=None):

# Add the text file to the database
add_media_with_keywords(
url=file_path,
url="its_a_book",
title=title,
media_type='document',
media_type='book',
content=content,
keywords=keywords,
prompt='No prompt for text files',
Expand Down
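
Net effect of the hunk above: `import_file_handler` now takes a list of uploads instead of a single file plus a user-supplied title, derives each title from the filename, and accumulates per-file status strings instead of returning on the first problem. A minimal sketch of that pattern (the `batch_import` name and plain-path inputs are hypothetical; the real handler receives Gradio file objects and dispatches to `import_epub`/`process_zip_file`):

```python
# Sketch of the new batch pattern: normalize to a list, process each file
# independently, collect per-file status lines, join them at the end.
import os

def batch_import(files):
    if not files:
        return "No files uploaded."
    if not isinstance(files, list):  # a single upload arrives bare, not in a list
        files = [files]

    results = []
    for path in files:
        if not os.path.exists(path):
            results.append(f"❌ File not found: {os.path.basename(path)}")
            continue  # one bad file no longer aborts the whole batch
        # The filename (minus extension) becomes the title, replacing the
        # old user-supplied `title` argument.
        title = os.path.splitext(os.path.basename(path))[0]
        results.append(f"📚 {os.path.basename(path)}: imported as '{title}'")
    return "\n\n".join(results)

print(batch_import(["/tmp/missing.epub"]))  # ❌ File not found: missing.epub
```

Joining per-file results means a missing or unsupported file is reported inline while the rest of the batch still imports.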
86 changes: 45 additions & 41 deletions App_Function_Libraries/Chat/Chat_Functions.py
@@ -6,6 +6,7 @@
 import logging
 import os
 import re
+import sqlite3
 import tempfile
 import time
 from datetime import datetime
@@ -14,7 +15,8 @@
 # External Imports
 #
 # Local Imports
-from App_Function_Libraries.DB.DB_Manager import get_conversation_name, save_chat_history_to_database
+from App_Function_Libraries.DB.DB_Manager import start_new_conversation, delete_messages_in_conversation, save_message
+from App_Function_Libraries.DB.RAG_QA_Chat_DB import get_db_connection, get_conversation_name
 from App_Function_Libraries.LLM_API_Calls import chat_with_openai, chat_with_anthropic, chat_with_cohere, \
     chat_with_groq, chat_with_openrouter, chat_with_deepseek, chat_with_mistral, chat_with_huggingface
 from App_Function_Libraries.LLM_API_Calls_Local import chat_with_aphrodite, chat_with_local_llm, chat_with_ollama, \
@@ -183,56 +185,58 @@ def save_chat_history_to_db_wrapper(chatbot, conversation_id, media_content, med
     log_counter("save_chat_history_to_db_attempt")
     start_time = time.time()
     logging.info(f"Attempting to save chat history. Media content type: {type(media_content)}")

     try:
-        # Extract the media_id and media_name from the media_content
-        media_id = None
-        if isinstance(media_content, dict):
+        # First check if we can access the database
+        try:
+            with get_db_connection() as conn:
+                cursor = conn.cursor()
+                cursor.execute("SELECT 1")
+        except sqlite3.DatabaseError as db_error:
+            logging.error(f"Database is corrupted or inaccessible: {str(db_error)}")
+            return conversation_id, "Database error: The database file appears to be corrupted. Please contact support."
+
+        # Now attempt the save
+        if not conversation_id:
+            # Only for new conversations, not updates
+            media_id = None
             logging.debug(f"Media content keys: {media_content.keys()}")
-            if 'content' in media_content:
+            if isinstance(media_content, dict) and 'content' in media_content:
                 try:
                     content = media_content['content']
-                    if isinstance(content, str):
-                        content_json = json.loads(content)
-                    elif isinstance(content, dict):
-                        content_json = content
-                    else:
-                        raise ValueError(f"Unexpected content type: {type(content)}")
-
-                    # Use the webpage_url as the media_id
+                    content_json = content if isinstance(content, dict) else json.loads(content)
                     media_id = content_json.get('webpage_url')
-                    # Use the title as the media_name
-                    media_name = content_json.get('title')
-
-                    logging.info(f"Extracted media_id: {media_id}, media_name: {media_name}")
-                except json.JSONDecodeError:
-                    logging.error("Failed to decode JSON from media_content['content']")
-                except Exception as e:
-                    logging.error(f"Error processing media_content: {str(e)}")
+                    media_name = media_name or content_json.get('title', 'Unnamed Media')
+                except (json.JSONDecodeError, AttributeError) as e:
+                    logging.error(f"Error processing media content: {str(e)}")
+                    media_id = "unknown_media"
+                    media_name = media_name or "Unnamed Media"
             else:
-                logging.warning("'content' key not found in media_content")
-        else:
-            logging.warning(f"media_content is not a dictionary. Type: {type(media_content)}")
-
-        if media_id is None:
-            # If we couldn't find a media_id, we'll use a placeholder
-            media_id = "unknown_media"
-            logging.warning(f"Unable to extract media_id from media_content. Using placeholder: {media_id}")
-
-        if media_name is None:
-            media_name = "Unnamed Media"
-            logging.warning(f"Unable to extract media_name from media_content. Using placeholder: {media_name}")
+                media_id = "unknown_media"
+                media_name = media_name or "Unnamed Media"

+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            conversation_title = f"{media_name}_{timestamp}"
+            conversation_id = start_new_conversation(title=conversation_title, media_id=media_id)
+            logging.info(f"Created new conversation with ID: {conversation_id}")
+
+        # For both new and existing conversations
+        try:
+            delete_messages_in_conversation(conversation_id)
+            for user_msg, assistant_msg in chatbot:
+                if user_msg:
+                    save_message(conversation_id, "user", user_msg)
+                if assistant_msg:
+                    save_message(conversation_id, "assistant", assistant_msg)
+        except sqlite3.DatabaseError as db_error:
+            logging.error(f"Database error during message save: {str(db_error)}")
+            return conversation_id, "Database error: Unable to save messages. Please try again or contact support."

-        # Generate a unique conversation name using media_id and current timestamp
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        conversation_name = f"{media_name}_{timestamp}"
-
-        new_conversation_id = save_chat_history_to_database(chatbot, conversation_id, media_id, media_name,
-                                                            conversation_name)
         save_duration = time.time() - start_time
         log_histogram("save_chat_history_to_db_duration", save_duration)
         log_counter("save_chat_history_to_db_success")
-        return new_conversation_id, f"Chat history saved successfully as {conversation_name}!"

+        return conversation_id, "Chat history saved successfully!"

     except Exception as e:
         log_counter("save_chat_history_to_db_error", labels={"error": str(e)})
         error_message = f"Failed to save chat history: {str(e)}"
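
The hunk above replaces the single `save_chat_history_to_database` call with a three-step flow against the RAG QA database: probe the connection, create a conversation only when none exists, then delete and rewrite that conversation's messages. A condensed sketch of the flow, assuming a toy SQLite schema (the table and column names are illustrative, not the project's actual `RAG_QA_Chat_DB` schema):

```python
# Sketch of the new save flow: probe the DB, create the conversation once,
# then delete-and-rewrite its messages. Schema and helper names are
# illustrative assumptions, not the project's actual API.
import sqlite3
from datetime import datetime

conn = sqlite3.connect(":memory:")
conn.executescript("""
    CREATE TABLE conversations (id INTEGER PRIMARY KEY, title TEXT, media_id TEXT);
    CREATE TABLE messages (conversation_id INTEGER, role TEXT, content TEXT);
""")

def save_chat(chatbot, conversation_id=None, media_name="Unnamed Media"):
    conn.execute("SELECT 1")  # cheap probe: fails fast if the DB is corrupted

    if not conversation_id:  # only new conversations get a title/media binding
        title = f"{media_name}_{datetime.now():%Y%m%d_%H%M%S}"
        cur = conn.execute("INSERT INTO conversations (title, media_id) VALUES (?, ?)",
                           (title, "unknown_media"))
        conversation_id = cur.lastrowid

    # Idempotent rewrite: clear old rows so re-saving an edited history
    # never duplicates messages.
    conn.execute("DELETE FROM messages WHERE conversation_id = ?", (conversation_id,))
    for user_msg, assistant_msg in chatbot:
        if user_msg:
            conn.execute("INSERT INTO messages VALUES (?, 'user', ?)",
                         (conversation_id, user_msg))
        if assistant_msg:
            conn.execute("INSERT INTO messages VALUES (?, 'assistant', ?)",
                         (conversation_id, assistant_msg))
    return conversation_id, "Chat history saved successfully!"

print(save_chat([("hi", "hello!")]))  # (1, 'Chat history saved successfully!')
```

The delete-then-rewrite step is what makes re-saving an existing conversation idempotent: its messages are replaced rather than appended.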