Add custom default Retriever/TextSplitter, update deprecated OpenAI model #67

Merged: 17 commits, Sep 2, 2024
Changes from 1 commit
feat: custom splitter from conf
stefanorosanelli committed Aug 27, 2024
commit 8b985eb1d49fcd45fc9a9e5d825ff255a63303f9
brevia/index.py: 49 changes (32 additions, 17 deletions)
@@ -6,13 +6,15 @@
 from langchain.vectorstores.pgvector import PGVector
 from langchain_community.vectorstores.pgembedding import CollectionStore
 from langchain_community.vectorstores.pgembedding import EmbeddingStore
+from langchain_text_splitters.base import TextSplitter
 from requests import HTTPError
 from sqlalchemy.orm import Session
 from brevia import connection, load_file
 from brevia.collections import single_collection_by_name
 from brevia.models import load_embeddings
 from brevia.settings import get_settings
 from brevia.utilities.json_api import query_data_pagination
+from brevia.utilities.types import load_type
 
 
 def init_index():
@@ -67,9 +69,13 @@ def add_document(
     document_id: str = None,
 ) -> int:
     """ Add document to index and return number of splitted text chunks"""
-    texts = split_document(document)
+    collection = single_collection_by_name(collection_name)
+    embed_conf = collection.cmetadata.get('embeddings', None) if collection else None
+    split_conf = collection.cmetadata.get('splitter', None) if collection else None
+
+    texts = split_document(document, split_conf)
     PGVector.from_documents(
-        embedding=load_embeddings(collection_embeddings(collection_name)),
+        embedding=load_embeddings(embed_conf),
         documents=texts,
         collection_name=collection_name,
         connection_string=connection.connection_string(),
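
Note: a sketch of the collection cmetadata shape that add_document now reads. Only the top-level 'embeddings' and 'splitter' keys are established by this hunk; the nested values below are illustrative assumptions.

# Hypothetical cmetadata for a collection; values are examples, not from this PR
cmetadata = {
    'embeddings': None,  # optional config dict forwarded to load_embeddings(); shape not shown in this diff
    'splitter': {
        # dotted class path, resolved via load_type() in create_custom_splitter (next hunk)
        'splitter': 'langchain_text_splitters.RecursiveCharacterTextSplitter',
        'chunk_size': 1500,
        'chunk_overlap': 100,
    },
}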
@@ -80,29 +86,38 @@
     return len(texts)
 
 
-def collection_embeddings(collection_name: str) -> dict | None:
-    """ Return custom embeddings of a collection"""
-    collection = single_collection_by_name(collection_name)
-    if collection is None:
-        return None
+def split_document(document: Document, split_conf: dict | None = None):
+    """ Split document into text chunks and return a list of documents"""
+    if not split_conf:
+        text_splitter = create_default_splitter()
+    else:
+        text_splitter = create_custom_splitter(split_conf)
 
-    return collection.cmetadata.get('embeddings', None)
+    texts = text_splitter.split_documents([document])
+    counter = 1
+    for text in texts:
+        text.metadata['part'] = counter
+        counter += 1
+    return texts
 
 
-def split_document(document: Document):
-    """ Split document into text chunks and return a list of documents"""
+def create_default_splitter() -> TextSplitter:
+    """ Create default text splitter"""
     settings = get_settings()
-    text_splitter = NLTKTextSplitter(
+
+    return NLTKTextSplitter(
         separator="\n",
         chunk_size=settings.text_chunk_size,
         chunk_overlap=settings.text_chunk_overlap
     )
-    texts = text_splitter.split_documents([document])
-    counter = 1
-    for text in texts:
-        text.metadata['part'] = counter
-        counter += 1
-    return texts
+
+
+def create_custom_splitter(split_conf: dict) -> TextSplitter:
+    """ Create custom text splitter"""
+    splitter_name = split_conf.pop('splitter', '')
+    splitter_class = load_type(splitter_name, TextSplitter)
+
+    return splitter_class(**split_conf)
 
 
 def remove_document(
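
Usage note: a minimal standalone sketch of the pattern this commit introduces: resolving a TextSplitter class from a config dict and instantiating it with the remaining keys as constructor arguments. Since brevia.utilities.types.load_type is not shown in this diff, the lookup is reimplemented here with importlib; the splitter class and parameter values are illustrative assumptions.

from importlib import import_module

from langchain_core.documents import Document
from langchain_text_splitters import TextSplitter


def splitter_from_conf(split_conf: dict) -> TextSplitter:
    """Resolve and build a text splitter from a config dict (sketch)."""
    conf = dict(split_conf)  # copy, so the caller's config is not mutated
    module_path, _, class_name = conf.pop('splitter', '').rpartition('.')
    splitter_class = getattr(import_module(module_path), class_name)
    if not (isinstance(splitter_class, type) and issubclass(splitter_class, TextSplitter)):
        raise ValueError(f'{class_name} is not a TextSplitter subclass')
    # remaining keys become constructor kwargs, mirroring create_custom_splitter
    return splitter_class(**conf)


conf = {
    'splitter': 'langchain_text_splitters.RecursiveCharacterTextSplitter',
    'chunk_size': 1500,
    'chunk_overlap': 100,
}
chunks = splitter_from_conf(conf).split_documents(
    [Document(page_content='Some long text to be chunked...')]
)
print(len(chunks))

One design note: create_custom_splitter pops the 'splitter' key from the conf dict in place; copying first, as above, avoids mutating the collection metadata across repeated calls.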