From 09bd38e8bc72dd21fa1f3f9bfed77155bfffe0ee Mon Sep 17 00:00:00 2001 From: Robert Date: Sat, 4 Jan 2025 23:20:03 -0800 Subject: [PATCH 1/3] sorting --- Docs/Design/Audio_Pipeline.md | 1 + Docs/Design/Education.md | 2 +- Docs/Design/Podcast.md | 21 +++ Docs/Design/TTS_STT.md | 312 +++++++++++++++------------------- Docs/Design/UX.md | 3 + Docs/Design/VLMs.md | 2 + 6 files changed, 168 insertions(+), 173 deletions(-) create mode 100644 Docs/Design/Podcast.md diff --git a/Docs/Design/Audio_Pipeline.md b/Docs/Design/Audio_Pipeline.md index 2cf8e90b..3b61bf97 100644 --- a/Docs/Design/Audio_Pipeline.md +++ b/Docs/Design/Audio_Pipeline.md @@ -26,6 +26,7 @@ https://pubs.aip.org/asa/jel/article/4/2/025206/3267247/Evaluating-OpenAI-s-Whis Transcription: https://github.com/AugmendTech/treeseg https://www.arxiv.org/abs/2407.12028 +https://github.com/Purfview/whisper-standalone-win https://huggingface.co/spaces/aadnk/faster-whisper-webui https://huggingface.co/spaces/zhang082799/openai-whisper-large-v3-turbo https://petewarden.com/2024/10/21/introducing-moonshine-the-new-state-of-the-art-for-speech-to-text/ diff --git a/Docs/Design/Education.md b/Docs/Design/Education.md index 66ae8339..da79250f 100644 --- a/Docs/Design/Education.md +++ b/Docs/Design/Education.md @@ -13,7 +13,7 @@ https://news.ycombinator.com/item?id=42534931 https://ankiweb.net/shared/info/1531888719 https://bbycroft.net/llm https://github.com/met4citizen/TalkingHead - +https://github.com/Rolandjg/skool4free one2manny diff --git a/Docs/Design/Podcast.md b/Docs/Design/Podcast.md new file mode 100644 index 00000000..a44f35d5 --- /dev/null +++ b/Docs/Design/Podcast.md @@ -0,0 +1,21 @@ +# Podcasts + + +## Introduction + + +### Link Dump: +https://github.com/FanaHOVA/smol-podcaster +https://huggingface.co/spaces/saq1b/podcastgen/blob/main/app.py +https://huggingface.co/spaces/mozilla-ai/document-to-podcast/blob/main/app.py +https://github.com/aedocw/epub2tts +https://github.com/lamm-mit/PDF2Audio +https://huggingface.co/spaces/bencser/episodegen +https://huggingface.co/spaces/lamm-mit/PDF2Audio +https://github.com/souzatharsis/podcastfy +https://github.com/agituts/gemini-2-podcast +https://github.com/meta-llama/llama-recipes/tree/main/recipes%2Fquickstart%2FNotebookLlama +https://github.com/JarodMica/audiobook_maker + + + diff --git a/Docs/Design/TTS_STT.md b/Docs/Design/TTS_STT.md index 7ac47c0c..f7169033 100644 --- a/Docs/Design/TTS_STT.md +++ b/Docs/Design/TTS_STT.md @@ -24,143 +24,130 @@ Flow: https://huggingface.co/blog/big-bench-audio-release https://huggingface.co/datasets/ArtificialAnalysis/big_bench_audio https://artificialanalysis.ai/models/speech-to-speech +https://github.com/Picovoice/speech-to-text-benchmark?tab=readme-ov-file +https://sakshi113.github.io/mmau_homepage/ - - +### Other +https://wave-pulse.io/ ### Link Dump: -https://github.com/albirrkarim/react-speech-highlight-demo -https://funaudiollm.github.io/cosyvoice2/ -https://funaudiollm.github.io/cosyvoice2/ -https://github.com/InternLM/InternLM-XComposer/tree/main/InternLM-XComposer-2.5-OmniLive -https://github.com/Azure-Samples/aisearch-openai-rag-audio -https://www.reddit.com/r/LocalLLaMA/comments/1f0awd6/best_local_open_source_texttospeech_and/ -https://github.com/FanaHOVA/smol-podcaster -https://docs.inferless.com/cookbook/serverless-customer-service-bot -https://wave-pulse.io/ -https://huggingface.co/spaces/saq1b/podcastgen/blob/main/app.py 
-https://huggingface.co/spaces/mozilla-ai/document-to-podcast/blob/main/app.py
-https://huggingface.co/spaces/Nymbo/Voice-Clone-Multilingual/tree/main
-https://github.com/aedocw/epub2tts
-https://github.com/microsoft/SpeechT5
-https://www.lightnote.co/?utm_source=www.hivefive.community&utm_medium=newsletter&utm_campaign=hive-five-202-a-bias-to-action
-https://github.com/smellslikeml/dolla_llama
-https://github.com/dnhkng/GlaDOS
-https://arxiv.org/abs/2501.01384
-https://sharechatx.github.io/
-https://github.com/vipchengrui/traditional-speech-enhancement
-https://github.com/ictnlp/LLaMA-Omni
-https://github.com/lamm-mit/PDF2Audio
+https://huggingface.co/NexaAIDev/Qwen2-Audio-7B-GGUF
+https://github.com/shagunmistry/NotebookLM_Alternative/tree/main/ai_helper
+https://docs.cartesia.ai/get-started/make-an-api-request
+https://arxiv.org/abs/2412.18566
+https://github.com/pipecat-ai/pipecat/tree/a367a038f1a3967292b5de5b43b8600a82a73fb6?tab=readme-ov-file
 https://github.com/Purfview/whisper-standalone-win
-https://github.com/livekit/agents
-https://huggingface.co/papers/2410.02678
-https://github.com/Picovoice/speech-to-text-benchmark
-https://huggingface.co/spaces/bencser/episodegen
+https://github.com/ictnlp/LLaMA-Omni
+https://levelup.gitconnected.com/build-a-real-time-ai-voice-and-video-chat-app-with-function-calling-by-gemini-2-0-49599a48fbe9?gi=c894f6c092be
+Blogposts
+    https://blog.duy.dev/build-your-own-voice-assistant-and-run-it-locally/
+    https://www.twilio.com/en-us/blog/twilio-openai-realtime-api-launch-integration
 
-STT
-    https://github.com/KoljaB/RealtimeSTT
-    https://github.com/southbridgeai/offmute
-    MoonShine
-        https://github.com/usefulsensors/moonshine
-        https://github.com/huggingface/transformers.js-examples/tree/main/moonshine-web
-        https://huggingface.co/onnx-community/moonshine-base-ONNX
+Full Pipelines
+    https://github.com/lhl/voicechat2?tab=readme-ov-file
+    https://github.com/eustlb/speech-to-speech
+    https://github.com/dnhkng/GlaDOS
+    https://github.com/mezbaul-h/june
+    https://github.com/matatonic/openedai-speech
+    https://github.com/pixelpump/Ai-Interview-Assistant-Python
+    https://github.com/huggingface/speech-to-speech
+    https://github.com/harvestingmoon/S2S
+    https://github.com/livekit/agents
 
-TTS
-    https://github.com/KoljaB/RealtimeTTS
-    https://si.inc/hertz-dev/
-101
-    https://www.inferless.com/learn/comparing-different-text-to-speech---tts--models-for-different-use-cases
-    https://clideo.com/resources/what-is-tts
-    https://pub.towardsai.net/the-ultimate-guide-to-audio-processing-principles-techniques-and-applications-7724efea00e8
-    RVC 101
-        https://gudgud96.github.io/2024/09/26/annotated-rvc/
+Voice2Voice Models
+    https://github.com/Standard-Intelligence/hertz-dev
+
+
+Editing Suites
+    https://github.com/abus-aikorea/voice-pro
 
-Datasets(?)
- https://voice-models.com/ -Auralis - https://github.com/astramind-ai/Auralis - https://www.astramind.ai/post/auralis +ElevenLabs + https://github.com/elevenlabs/elevenlabs-examples/blob/main/examples/text-to-speech/python/text_to_speech_file.py + https://elevenlabs.io/docs/api-reference/text-to-speech + https://elevenlabs.io/docs/developer-guides/how-to-use-tts-with-streaming +Alltalk +AlwaysReddy - (uses Piper) Amphion https://github.com/open-mmlab/Amphion https://huggingface.co/amphion/Vevo https://github.com/open-mmlab/Amphion/blob/main/models/vc/vevo/README.md https://openreview.net/pdf?id=anQDiQZhDP https://versavoice.github.io/ - +Auralis + https://github.com/astramind-ai/Auralis + https://www.astramind.ai/post/auralis Bark -https://github.com/suno-ai/bark - -ChatTTS -https://huggingface.co/2Noise/ChatTTS -https://chattts.com/#Demo - + https://github.com/suno-ai/bark +Cartesia + https://docs.cartesia.ai/get-started/make-an-api-request +Chat TTS + https://huggingface.co/2Noise/ChatTTS + https://chattts.com/#Demo + https://github.com/2noise/ChatTTS Coqui TTS https://github.com/idiap/coqui-ai-TTS https://huggingface.co/spaces/coqui/xtts/blob/main/app.py - -Cartesia - https://docs.cartesia.ai/get-started/make-an-api-request - + https://github.com/coqui-ai/TTS +CosyVoice2 + https://funaudiollm.github.io/cosyvoice2/ +Daswers XTTS GUI F5 TTS https://github.com/SWivid/F5-TTS - -lina TTS -https://github.com/theodorblackbird/lina-speech/blob/main/InferenceLina.ipynb -https://github.com/theodorblackbird/lina-speech - -Podcastfy - https://github.com/souzatharsis/podcastfy/blob/main/podcastfy/tts/base.py - https://github.com/souzatharsis/podcastfy/blob/main/podcastfy/text_to_speech.py - https://github.com/souzatharsis/podcastfy/blob/main/podcastfy/content_generator.py - -GLM-4-Voice - https://github.com/THUDM/GLM-4-Voice/blob/main/README_en.md - https://github.com/THUDM/GLM-4-Voice/tree/main - -MoonShine - https://huggingface.co/onnx-community/moonshine-base-ONNX - https://huggingface.co/spaces/webml-community/moonshine-web - https://github.com/huggingface/transformers.js-examples/tree/main/moonshine-web - +Fish-Speech + https://github.com/fishaudio/fish-speech/tree/main + https://github.com/fishaudio/fish-speech/blob/main/Start_Agent.md + https://huggingface.co/fishaudio/fish-agent-v0.1-3b/tree/main Gemini https://ai.google.dev/gemini-api/docs#rest https://ai.google.dev/gemini-api/docs/models/gemini-v2 https://github.com/google-gemini/cookbook/blob/main/quickstarts/Audio.ipynb - -ElevenLabs - https://github.com/elevenlabs/elevenlabs-examples/blob/main/examples/text-to-speech/python/text_to_speech_file.py - https://elevenlabs.io/docs/api-reference/text-to-speech - https://elevenlabs.io/docs/developer-guides/how-to-use-tts-with-streaming - -Models - https://huggingface.co/NexaAIDev/Qwen2-Audio-7B-GGUF - -Merging Audio - https://github.com/jiaaro/pydub - - - +Google + https://github.com/google-gemini/cookbook/tree/main/gemini-2 + https://discuss.ai.google.dev/t/how-does-one-get-access-to-the-api-for-tts-features-of-gemini-2-0/53925/15 + https://illuminate.google.com/home?pli=1 +GLM-4-Voice + https://github.com/THUDM/GLM-4-Voice/blob/main/README_en.md + https://github.com/THUDM/GLM-4-Voice/tree/main + https://huggingface.co/cydxg/glm-4-voice-9b-int4/blob/main/README_en.md +GPT-SoviTTS + 
https://github.com/cpumaxx/sovits-ff-plugin + https://github.com/JarodMica/GPT-SoVITS-Package +lina TTS + https://github.com/theodorblackbird/lina-speech/blob/main/InferenceLina.ipynb + https://github.com/theodorblackbird/lina-speech +LMNT MaskGCT https://maskgct.github.io/#emotion-samples https://github.com/open-mmlab/Amphion/blob/main/models/tts/maskgct/README.md https://github.com/open-mmlab/Amphion/blob/main/models/tts/maskgct/maskgct_demo.ipynb https://github.com/open-mmlab/Amphion/blob/main/models/tts/maskgct/maskgct_inference.py https://huggingface.co/amphion/MaskGCT - +MeloTTS + https://github.com/myshell-ai/MeloTTS Mimic https://github.com/MycroftAI/mimic3 - - +MoonShine + https://huggingface.co/onnx-community/moonshine-base-ONNX + https://huggingface.co/spaces/webml-community/moonshine-web + https://github.com/huggingface/transformers.js-examples/tree/main/moonshine-web +Neuro-sama + https://github.com/JarodMica/open-neruosama +Open-LLM-VTuber +OpenVoice + https://github.com/myshell-ai/OpenVoice +Outte + https://github.com/edwko/OuteTTS + https://huggingface.co/OuteAI/OuteTTS-0.2-500M-GGUF Parler https://github.com/huggingface/parler-tts - -Piper (linux only) +Paroli - Streaming mode implementation of the Piper TTS with RK3588 NPU acceleration support. +PiperTTS - A fast, local neural text to speech system that is optimized for the Raspberry Pi 4. https://github.com/rhasspy/piper https://github.com/rhasspy/piper/issues/644 https://github.com/rhasspy/piper/discussions/326#discussioncomment-7935208 @@ -170,57 +157,84 @@ Piper (linux only) https://huggingface.co/rhasspy/piper-voices/tree/main https://huggingface.co/datasets/rhasspy/piper-checkpoints/tree/main +PiperUI Sherpa ONNX https://github.com/k2-fsa/sherpa-onnx - +Silero +SpeechT5 + https://github.com/microsoft/SpeechT5 +SoundStorm + https://deepmind.google/discover/blog/pushing-the-frontiers-of-audio-generation/ + https://github.com/lucidrains/soundstorm-pytorch +Styletts2 +Tortoise TTS +VallE-X +VoiceCraft - +xtts +xtts2 +Yapper + https://github.com/n1teshy/yapper-tts YourTTS https://github.com/Edresson/YourTTS -TTS Pipeline - https://www.astramind.ai/post/auralis +https://docs.inferless.com/cookbook/serverless-customer-service-bot +https://github.com/dnhkng/GlaDOS -https://github.com/cpumaxx/sovits-ff-plugin +STT + https://github.com/KoljaB/RealtimeSTT + https://github.com/southbridgeai/offmute + https://github.com/flatmax/speech-to-text + https://github.com/collabora/WhisperLive + MoonShine + https://github.com/usefulsensors/moonshine + https://github.com/huggingface/transformers.js-examples/tree/main/moonshine-web + https://huggingface.co/onnx-community/moonshine-base-ONNX +TTS + https://github.com/KoljaB/RealtimeTTS + https://si.inc/hertz-dev/ -Train using: https://github.com/Mangio621/Mangio-RVC-Fork/releases, -import the .pth into https://huggingface.co/wok000/vcclient000/tree/main to convert your voice in near real time with about a .25s delay +101 + https://www.inferless.com/learn/comparing-different-text-to-speech---tts--models-for-different-use-cases + https://clideo.com/resources/what-is-tts + https://pub.towardsai.net/the-ultimate-guide-to-audio-processing-principles-techniques-and-applications-7724efea00e8 + RVC 101 + https://gudgud96.github.io/2024/09/26/annotated-rvc/ 
-https://www.hackster.io/lhl/voicechat2-local-ai-voice-chat-4c48f2 +Datasets(?) + https://voice-models.com/ -https://github.com/abus-aikorea/voice-pro -https://github.com/myshell-ai/MeloTTS -https://github.com/idiap/coqui-ai-TTS -https://docs.inferless.com/cookbook/serverless-customer-service-bot +Podcastfy + https://github.com/souzatharsis/podcastfy/blob/main/podcastfy/tts/base.py + https://github.com/souzatharsis/podcastfy/blob/main/podcastfy/text_to_speech.py + https://github.com/souzatharsis/podcastfy/blob/main/podcastfy/content_generator.py + +Models + https://huggingface.co/NexaAIDev/Qwen2-Audio-7B-GGUF + +Merging Audio + https://github.com/jiaaro/pydub + + + +TTS Pipeline + https://www.astramind.ai/post/auralis + +Train using: https://github.com/Mangio621/Mangio-RVC-Fork/releases, +import the .pth into https://huggingface.co/wok000/vcclient000/tree/main to convert your voice in near real time with about a .25s delay -https://huggingface.co/spaces/lamm-mit/PDF2Audio -https://huggingface.co/spaces/bencser/episodegen -https://github.com/myshell-ai/MeloTTS -https://github.com/idiap/coqui-ai-TTS -https://docs.inferless.com/cookbook/serverless-customer-service-bot -https://github.com/Picovoice/speech-to-text-benchmark Train using: https://github.com/Mangio621/Mangio-RVC-Fork/releases, import the .pth into https://huggingface.co/wok000/vcclient000/tree/main to convert your voice in near real time with about a .25s delay -https://www.hackster.io/lhl/voicechat2-local-ai-voice-chat-4c48f2 -https://huggingface.co/papers/2410.02678 -https://github.com/livekit/agents -https://github.com/pipecat-ai/pipecat/tree/a367a038f1a3967292b5de5b43b8600a82a73fb6?tab=readme-ov-file -https://github.com/lamm-mit/PDF2Audio -https://github.com/Purfview/whisper-standalone-win -https://github.com/ictnlp/LLaMA-Omni -https://levelup.gitconnected.com/build-a-real-time-ai-voice-and-video-chat-app-with-function-calling-by-gemini-2-0-49599a48fbe9?gi=c894f6c092be -https://github.com/agituts/gemini-2-podcast -https://github.com/SWivid/F5-TTS -https://github.com/matatonic/openedai-speech https://github.com/RVC-Boss/GPT-SoVITS https://www.bilibili.com/video/BV11iiNegEGP/ @@ -239,54 +253,8 @@ Only thing I changed was remove the space at the beginning of each lines in your And make sure you get the latest version https://github.com/RVC-Boss/GPT-SoVITS/releases -https://github.com/souzatharsis/podcastfy - -https://github.com/THUDM/GLM-4-Voice/tree/main - -https://huggingface.co/cydxg/glm-4-voice-9b-int4/blob/main/README_en.md -https://github.com/meta-llama/llama-recipes/tree/main/recipes%2Fquickstart%2FNotebookLlama - - -https://sakshi113.github.io/mmau_homepage/ -https://github.com/fishaudio/fish-speech/tree/main -https://github.com/fishaudio/fish-speech/blob/main/Start_Agent.md -https://huggingface.co/fishaudio/fish-agent-v0.1-3b/tree/main - -https://github.com/pixelpump/Ai-Interview-Assistant-Python -https://github.com/coqui-ai/TTS -https://github.com/Standard-Intelligence/hertz-dev -https://github.com/2noise/ChatTTS - -https://github.com/edwko/OuteTTS -https://huggingface.co/OuteAI/OuteTTS-0.2-500M-GGUF -https://huggingface.co/NexaAIDev/Qwen2-Audio-7B-GGUF - -https://www.twilio.com/en-us/blog/twilio-openai-realtime-api-launch-integration -https://github.com/huggingface/speech-to-speech 
-https://github.com/harvestingmoon/S2S -https://github.com/collabora/WhisperLive -https://github.com/JarodMica/audiobook_maker -https://github.com/myshell-ai/OpenVoice -https://github.com/JarodMica/GPT-SoVITS-Package -https://github.com/shagunmistry/NotebookLM_Alternative/tree/main/ai_helper -https://docs.cartesia.ai/get-started/make-an-api-request -https://github.com/JarodMica/open-neruosama -https://github.com/flatmax/speech-to-text -https://arxiv.org/abs/2412.18566 -https://github.com/Rolandjg/skool4free - - -SoundStorm - https://deepmind.google/discover/blog/pushing-the-frontiers-of-audio-generation/ - https://github.com/lucidrains/soundstorm-pytorch - - -Google -https://github.com/google-gemini/cookbook/tree/main/gemini-2 -https://discuss.ai.google.dev/t/how-does-one-get-access-to-the-api-for-tts-features-of-gemini-2-0/53925/15 -https://illuminate.google.com/home?pli=1 ``` import asyncio import base64 diff --git a/Docs/Design/UX.md b/Docs/Design/UX.md index 19b6e113..b6bc4b0c 100644 --- a/Docs/Design/UX.md +++ b/Docs/Design/UX.md @@ -98,3 +98,6 @@ Prompt Engineering page: - Looks like Claude Workshop UI window + +Website features +https://github.com/albirrkarim/react-speech-highlight-demo diff --git a/Docs/Design/VLMs.md b/Docs/Design/VLMs.md index 728ad1cb..56dbad7f 100644 --- a/Docs/Design/VLMs.md +++ b/Docs/Design/VLMs.md @@ -62,3 +62,5 @@ Apollo https://huggingface.co/GoodiesHere/Apollo-LMMs-Apollo-3B-t32 https://www.reddit.com/r/LocalLLaMA/comments/1hgri8g/has_apollo_disappeared/ +InternLM-XComposer + https://github.com/InternLM/InternLM-XComposer/tree/main/InternLM-XComposer-2.5-OmniLive \ No newline at end of file From 7196fd49501175b5c47cec0d1913833297910c3a Mon Sep 17 00:00:00 2001 From: Robert Date: Sun, 5 Jan 2025 09:14:32 -0800 Subject: [PATCH 2/3] f --- Docs/Design/Researcher.md | 2 +- Docs/Design/TTS_STT.md | 2 +- Docs/Design/VLMs.md | 2 +- Docs/RAG_Notes.md | 4 ++++ 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/Docs/Design/Researcher.md b/Docs/Design/Researcher.md index c4203c00..bb7b4a95 100644 --- a/Docs/Design/Researcher.md +++ b/Docs/Design/Researcher.md @@ -144,7 +144,7 @@ https://www.researchrabbit.ai/ https://github.com/faraz18001/Sales-Llama https://github.com/memgraph/memgraph https://github.com/rashadphz/farfalle/tree/main/src/backend - +https://github.com/SakanaAI/AI-Scientist https://github.com/rashadphz/farfalle/blob/main/src/backend/agent_search.py https://github.com/rashadphz/farfalle/blob/main/src/backend/prompts.py https://github.com/stanford-oval/storm/ diff --git a/Docs/Design/TTS_STT.md b/Docs/Design/TTS_STT.md index 7ac47c0c..d43ba210 100644 --- a/Docs/Design/TTS_STT.md +++ b/Docs/Design/TTS_STT.md @@ -181,7 +181,7 @@ TTS Pipeline https://www.astramind.ai/post/auralis https://github.com/cpumaxx/sovits-ff-plugin - +https://github.com/satvik314/opensource_notebooklm/blob/main/opensource_notebooklm.ipynb Train using: https://github.com/Mangio621/Mangio-RVC-Fork/releases, diff --git a/Docs/Design/VLMs.md b/Docs/Design/VLMs.md index 728ad1cb..7965f1b4 100644 --- a/Docs/Design/VLMs.md +++ b/Docs/Design/VLMs.md @@ -9,7 +9,7 @@ https://colab.research.google.com/drive/1wkCIO6q8UDJQbPsu8jI_og1JAUibpp38?usp=sh https://colab.research.google.com/drive/1bcSu_mLki11aXpbS6Fwo_WF5idJv8uId?usp=sharing https://colab.research.google.com/drive/1E-ySj39oldXcvcsbjDYnvW-USi281Llj?usp=sharing - 
+https://github.com/pliang279/awesome-multimodal-ml https://arxiv.org/abs/2411.18279 https://github.com/breezedeus/Pix2Text https://huggingface.co/HuggingFaceTB/SmolVLM-Instruct diff --git a/Docs/RAG_Notes.md b/Docs/RAG_Notes.md index fadf7a97..0ad31457 100644 --- a/Docs/RAG_Notes.md +++ b/Docs/RAG_Notes.md @@ -47,7 +47,11 @@ https://arxiv.org/abs/2412.15563 https://arxiv.org/pdf/2410.10293v1 https://arxiv.org/pdf/2409.02098v1 https://github.com/OpenSPG/KAG +https://generativeai.pub/graph-rag-has-awesome-potential-but-currently-has-serious-flaws-c052a8a3107e +https://pub.towardsai.net/kag-graph-multimodal-rag-llm-agents-powerful-ai-reasoning-b3da38d31358 +https://arxiv.org/abs/2412.19442 +https://arxiv.org/abs/2412.15605 GraphRAG https://www.microsoft.com/en-us/research/blog/introducing-drift-search-combining-global-and-local-search-methods-to-improve-quality-and-efficiency/ From c88c0df4a972ceb179864bd3f9352c08d8abe23a Mon Sep 17 00:00:00 2001 From: Robert Date: Sun, 5 Jan 2025 20:49:35 -0800 Subject: [PATCH 3/3] I'm an idiot. --- App_Function_Libraries/Chat/Chat_Functions.py | 11 +++----- App_Function_Libraries/Gradio_UI/Chat_ui.py | 11 +++++--- .../Gradio_UI/Workflows_tab.py | 28 +++++++++++-------- Docs/Design/DB_Design.md | 1 + Docs/Design/Diagram_Generation.md | 2 +- Docs/Design/Prompts.md | 2 +- Docs/Design/Structured_Outputs.md | 2 +- Docs/Design/TTS_STT.md | 3 ++ Docs/Design/Text2SQL.md | 2 +- Docs/Design/UX.md | 3 ++ Docs/RAG_Notes.md | 13 +++++++-- README.md | 1 + 12 files changed, 50 insertions(+), 29 deletions(-) diff --git a/App_Function_Libraries/Chat/Chat_Functions.py b/App_Function_Libraries/Chat/Chat_Functions.py index aeea3107..4afbfc2d 100644 --- a/App_Function_Libraries/Chat/Chat_Functions.py +++ b/App_Function_Libraries/Chat/Chat_Functions.py @@ -49,12 +49,9 @@ def chat_api_call(api_endpoint, api_key, input_data, prompt, temp, system_messag logging.info(f"Debug - Chat API Call - API Endpoint: {api_endpoint}") log_counter("chat_api_call_attempt", labels={"api_endpoint": api_endpoint}) start_time = time.time() - if not api_key: - api_key = None - model = None try: logging.info(f"Debug - Chat API Call - API Endpoint: {api_endpoint}") - logging.info(f"Debug - Chat API Call - API Key: {api_key}") + logging.info(f"Debug - Chat API Call - API Key: {api_key[:5]}...{api_key[-5:]}") logging.info(f"Debug - Chat chat_api_call - API Endpoint: {api_endpoint}") if api_endpoint.lower() == 'openai': response = chat_with_openai(api_key, input_data, prompt, temp, system_message, streaming, minp, maxp, model) @@ -143,7 +140,7 @@ def chat_api_call(api_endpoint, api_key, input_data, prompt, temp, system_messag return f"An error occurred: {str(e)}" -def chat(message, history, media_content, selected_parts, api_endpoint, api_key, prompt, temperature, +def chat(message, history, media_content, selected_parts, api_endpoint, api_key, custom_prompt, temperature, system_message=None, streaming=False, minp=None, maxp=None, model=None): log_counter("chat_attempt", labels={"api_endpoint": api_endpoint}) start_time = time.time() @@ -180,10 +177,10 @@ def chat(message, history, media_content, selected_parts, api_endpoint, api_key, logging.debug(f"Debug - Chat Function - Temperature: {temperature}") logging.debug(f"Debug - Chat Function - API Key: {api_key[:10]}") - logging.debug(f"Debug - Chat Function - Prompt: {prompt}") + logging.debug(f"Debug - Chat Function - Prompt: {custom_prompt}") # Use the existing API request code based on the selected endpoint - response = 
chat_api_call(api_endpoint, api_key, input_data, prompt, temp, system_message, streaming, minp=None, maxp=None, model=None) + response = chat_api_call(api_endpoint, api_key, input_data, custom_prompt, temp, system_message, streaming, minp, maxp, model) if streaming: return response diff --git a/App_Function_Libraries/Gradio_UI/Chat_ui.py b/App_Function_Libraries/Gradio_UI/Chat_ui.py index f728ab34..b4447516 100644 --- a/App_Function_Libraries/Gradio_UI/Chat_ui.py +++ b/App_Function_Libraries/Gradio_UI/Chat_ui.py @@ -93,7 +93,9 @@ def chat_wrapper(message, history, media_content, selected_parts, api_endpoint, presence_penalty=None, stop_sequence=None): try: if save_conversation: + logging.info("chat_wrapper(): Saving conversation") if conversation_id is None: + logging.info("chat_wrapper(): Creating a new conversation") # Create a new conversation media_id = media_content.get('id', None) conversation_name = f"Chat about {media_content.get('title', 'Unknown Media')} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}" @@ -111,11 +113,12 @@ def chat_wrapper(message, history, media_content, selected_parts, api_endpoint, full_message = message # Generate bot response + logging.debug("chat_wrapper(): Generating bot response") bot_message = "" for chunk in chat(full_message, history, media_content, selected_parts, api_endpoint, api_key, custom_prompt, - temperature, system_prompt, streaming): + temperature, system_prompt, streaming, minp=None, maxp=None, model=None): bot_message += chunk # Accumulate the streamed response - logging.debug(f"Bot message being returned: {bot_message}") + logging.debug(f"chat_wrapper(): Bot message being returned: {bot_message}") # Yield the incremental response and updated history yield bot_message, history + [(message, bot_message)], conversation_id @@ -124,8 +127,8 @@ def chat_wrapper(message, history, media_content, selected_parts, api_endpoint, save_message(conversation_id, role="assistant", content=bot_message) except Exception as e: - logging.error(f"Error in chat wrapper: {str(e)}") - yield "An error occurred.", history, conversation_id + logging.error(f"chat_wrapper(): Error in chat wrapper: {str(e)}") + yield "chat_wrapper(): An error occurred.", history, conversation_id def search_conversations(query): diff --git a/App_Function_Libraries/Gradio_UI/Workflows_tab.py b/App_Function_Libraries/Gradio_UI/Workflows_tab.py index 05530af2..1689b837 100644 --- a/App_Function_Libraries/Gradio_UI/Workflows_tab.py +++ b/App_Function_Libraries/Gradio_UI/Workflows_tab.py @@ -47,7 +47,6 @@ def chat_workflows_tab(): with gr.Row(): with gr.Column(): workflow_selector = gr.Dropdown(label="Select Workflow", choices=[wf['name'] for wf in workflows]) - # Refactored API selection dropdown api_selector = gr.Dropdown( choices=["None"] + [format_api_name(api) for api in global_api_endpoints], value=default_value, @@ -91,8 +90,7 @@ def update_workflow_ui(workflow_name): logging.error(f"Selected workflow not found: {workflow_name}") return {"current_step": 0, "max_steps": 0, "conversation_id": None}, "", [] - def process_workflow_step(message, history, context, workflow_name, api_endpoint, api_key, workflow_state, - save_conv, temp): + def process_workflow_step(message, history, context, workflow_name, api_endpoint, api_key, workflow_state, save_conv, temp): logging.info(f"Process workflow step called with message: {message}") logging.info(f"Current workflow state: {workflow_state}") try: @@ -114,13 +112,20 @@ def process_workflow_step(message, history, context, workflow_name, 
api_endpoint full_message = f"{context}\n\nStep {current_step + 1}: {prompt}\nUser: {message}" logging.info(f"Calling chat_wrapper with full_message: {full_message[:100]}...") - bot_message, new_history, new_conversation_id = chat_wrapper( + + # Initialize bot message to accumulate stremed response + bot_message = "" + + # call chat _wrapper + for chunk, new_history, new_conversation_id in chat_wrapper( full_message, history, media_content.value, selected_parts.value, api_endpoint, api_key, "", workflow_state["conversation_id"], save_conv, temp, "You are a helpful assistant guiding through a workflow." - ) + ): + bot_message = chunk # Update bot message with the latest chunk + yield new_history, workflow_state, gr.update(interactive=True) + logging.info(f"Received bot_message: {bot_message[:50]}...") - logging.info(f"Received bot_message: {bot_message[:100]}...") next_step = current_step + 1 new_workflow_state = { @@ -131,15 +136,17 @@ def process_workflow_step(message, history, context, workflow_name, api_endpoint if next_step >= max_steps: logging.info("Workflow completed after this step") - return new_history, new_workflow_state, gr.update(interactive=False) + yield history + [(message, bot_message)], new_workflow_state, gr.update(interactive=False) else: next_prompt = selected_workflow['prompts'][next_step] + new_history = history + [(message, bot_message)] new_history.append((None, f"Step {next_step + 1}: {next_prompt}")) logging.info(f"Moving to next step: {next_step}") - return new_history, new_workflow_state, gr.update(interactive=True) + yield new_history, new_workflow_state, gr.update(interactive=True) + except Exception as e: logging.error(f"Error in process_workflow_step: {str(e)}") - return history, workflow_state, gr.update(interactive=True) + yield history, workflow_state, gr.update(interactive=True) workflow_selector.change( update_workflow_ui, @@ -149,8 +156,7 @@ def process_workflow_step(message, history, context, workflow_name, api_endpoint submit_btn.click( process_workflow_step, - inputs=[msg, chatbot, context_input, workflow_selector, api_selector, api_key_input, workflow_state, - save_conversation, temperature], + inputs=[msg, chatbot, context_input, workflow_selector, api_selector, api_key_input, workflow_state, save_conversation, temperature], outputs=[chatbot, workflow_state, msg] ).then( lambda: gr.update(value=""), diff --git a/Docs/Design/DB_Design.md b/Docs/Design/DB_Design.md index 27d536ef..aa4b1176 100644 --- a/Docs/Design/DB_Design.md +++ b/Docs/Design/DB_Design.md @@ -19,6 +19,7 @@ Migrating to sqlite-vec https://docs.google.com/document/d/1sJ_S2ggfFmtPJupxIO3C1EZAFuDMUfNYcAytissbFMs/edit?tab=t.0#heading=h.xyau1jyb6vyx https://github.com/Mozilla-Ocho/llamafile/pull/644 +https://ai.plainenglish.io/top-interview-questions-on-data-modeling-concepts-3d1587c86214 https://briandouglas.ie/sqlite-defaults/ https://phiresky.github.io/blog/2020/sqlite-performance-tuning/ https://kerkour.com/sqlite-for-servers diff --git a/Docs/Design/Diagram_Generation.md b/Docs/Design/Diagram_Generation.md index e71f72ad..bc92e256 100644 --- a/Docs/Design/Diagram_Generation.md +++ b/Docs/Design/Diagram_Generation.md @@ -5,7 +5,7 @@ https://excalidraw.com/ https://www.napkin.ai/ https://github.com/southbridgeai/diagen - +https://levelup.gitconnected.com/uml-diagrams-a-guide-for-software-engineers-71220ffb775f?source=home_following---------57-1--------------------dd5db0ec_9e4b_478a_951e_a16e50e4d723-------3 diff --git a/Docs/Design/Prompts.md b/Docs/Design/Prompts.md 
index a64f6f6b..af67e7b7 100644 --- a/Docs/Design/Prompts.md +++ b/Docs/Design/Prompts.md @@ -9,7 +9,7 @@ https://github.com/microsoft/PromptWizard https://medium.com/@camauger/crafting-effective-chatgpt-prompts-for-tabletop-roleplaying-games-a-step-by-step-guide-part-1-b81a791d278d - +https://towardsdatascience.com/how-i-won-singapores-gpt-4-prompt-engineering-competition-34c195a93d41 diff --git a/Docs/Design/Structured_Outputs.md b/Docs/Design/Structured_Outputs.md index 72d80306..bbcfbb83 100644 --- a/Docs/Design/Structured_Outputs.md +++ b/Docs/Design/Structured_Outputs.md @@ -2,7 +2,7 @@ https://towardsdatascience.com/diving-deeper-with-structured-outputs-b4a5d280c208 - +https://generativeai.pub/building-multi-agent-llm-systems-with-pydanticai-framework-a-step-by-step-guide-to-create-ai-5e41fbba2608 ## Introduction This page serves as documentation regarding the structured outputs within tldw and provides context/justification for the decisions made within the module. diff --git a/Docs/Design/TTS_STT.md b/Docs/Design/TTS_STT.md index dc0c1db8..5b23d5fd 100644 --- a/Docs/Design/TTS_STT.md +++ b/Docs/Design/TTS_STT.md @@ -118,6 +118,9 @@ GLM-4-Voice GPT-SoviTTS https://github.com/cpumaxx/sovits-ff-plugin https://github.com/JarodMica/GPT-SoVITS-Package +Kokoro + https://github.com/thewh1teagle/kokoro-onnx + https://huggingface.co/hexgrad/Kokoro-82M lina TTS https://github.com/theodorblackbird/lina-speech/blob/main/InferenceLina.ipynb https://github.com/theodorblackbird/lina-speech diff --git a/Docs/Design/Text2SQL.md b/Docs/Design/Text2SQL.md index 029ec72f..74ff4d20 100644 --- a/Docs/Design/Text2SQL.md +++ b/Docs/Design/Text2SQL.md @@ -19,5 +19,5 @@ https://github.com/TAG-Research/TAG-Bench https://arxiv.org/pdf/2407.14482 https://spider2-sql.github.io/ https://departmentofproduct.substack.com/p/how-to-write-sql-queries-using-ai?utm_medium=email&triedRedirect=true - +https://generativeai.pub/ai-innovations-and-insights-15-text2sql-and-extractous-2643981b197f diff --git a/Docs/Design/UX.md b/Docs/Design/UX.md index b6bc4b0c..3218917c 100644 --- a/Docs/Design/UX.md +++ b/Docs/Design/UX.md @@ -84,6 +84,9 @@ https://markwhen.com/ https://kando.menu/ https://deepseek-artifacts.vercel.app/ https://darkpatternsgame.productartistry.com/ +https://towardsdatascience.com/building-trust-in-llm-answers-highlighting-source-texts-in-pdfs-5d1342ecb811 +https://levelup.gitconnected.com/textual-how-this-python-framework-is-revolutionizing-ui-development-in-2025-7bfb0fd41a59 +https://cin-model-kotaemon.hf.space/app/ Not waifus, but clippy: diff --git a/Docs/RAG_Notes.md b/Docs/RAG_Notes.md index 0ad31457..33652789 100644 --- a/Docs/RAG_Notes.md +++ b/Docs/RAG_Notes.md @@ -49,9 +49,16 @@ https://arxiv.org/pdf/2409.02098v1 https://github.com/OpenSPG/KAG https://generativeai.pub/graph-rag-has-awesome-potential-but-currently-has-serious-flaws-c052a8a3107e https://pub.towardsai.net/kag-graph-multimodal-rag-llm-agents-powerful-ai-reasoning-b3da38d31358 - +https://medium.com/@florian_algo/ai-innovations-and-insights-14-funnelrag-and-craft-8436f97286bb https://arxiv.org/abs/2412.19442 https://arxiv.org/abs/2412.15605 +https://arxiv.org/html/2412.18069v1 +https://github.com/smallporridge/AssistRAG +https://medium.com/@samarrana407/why-knowledge-augmented-generation-kag-is-the-best-approach-to-rag-2e7820228087 + + + + GraphRAG 
https://www.microsoft.com/en-us/research/blog/introducing-drift-search-combining-global-and-local-search-methods-to-improve-quality-and-efficiency/
@@ -88,8 +95,8 @@ GraphRAG
     https://volodymyrpavlyshyn.medium.com/unified-knowledge-graph-model-rdf-rdf-vs-lpg-the-end-of-war-a7c14d6ac76f
     https://github.com/zjunlp/OneKE
     https://blog.gopenai.com/llm-ontology-prompting-for-knowledge-graph-extraction-efdcdd0db3a1?gi=1d8915f0da5e
-
-
+    https://towardsdatascience.com/how-to-build-a-graph-rag-app-b323fc33ba06
+    https://medium.com/@dickson.lukose/ontology-modelling-and-engineering-4df8b6b9f3a5
 
 
 ### Links
diff --git a/README.md b/README.md
index 240fe666..fd876f16 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,7 @@
 - This project started as a tool by `the-crypt-keeper` to perform summarization of YouTube videos.
 - I forked it, to add a couple features as I wanted to use it to help me consume conference videos at a faster pace. I kept adding/improving things and now it's a fully different tool/focus.
 - You can find the original scripts by `the-crypt-keeper` in the `tldw-original-scripts` directory, a snapshot of the files before I made my changes.
+- The GUI is currently a rough, Gradio-based placeholder; it will be replaced with a FastAPI backend and a new JS frontend.
 
 
 ### Updating from a version installed prior to Nov 1st: