diff --git a/brevia/index.py b/brevia/index.py index 6ec4ceb..43c9af7 100644 --- a/brevia/index.py +++ b/brevia/index.py @@ -1,6 +1,8 @@ """Index document with embeddings in vector database.""" +from functools import lru_cache from os import path from logging import getLogger +from warnings import warn from langchain_community.vectorstores.pgembedding import CollectionStore from langchain_community.vectorstores.pgembedding import EmbeddingStore from langchain_community.vectorstores.pgvector import PGVector @@ -19,9 +21,16 @@ def init_index(): """Init index data""" + warn("init_index deprecated, use init_splitting_data instead", DeprecationWarning) + init_splitting_data() + + +@lru_cache +def init_splitting_data() -> bool: + """Init splitting tools data (NLTK for now)""" try: import nltk # pylint: disable=import-outside-toplevel - nltk.download('punkt') + return nltk.download('punkt') except ImportError as exc: raise ImportError( @@ -92,6 +101,7 @@ def split_document( document: Document, collection_meta: dict = {} ) -> list[Document]: """ Split document into text chunks and return a list of documents""" + init_splitting_data() text_splitter = create_splitter(collection_meta) texts = text_splitter.split_documents([document]) counter = 1 diff --git a/brevia/routers/app_routers.py b/brevia/routers/app_routers.py index 56a6da5..931b65a 100644 --- a/brevia/routers/app_routers.py +++ b/brevia/routers/app_routers.py @@ -1,6 +1,5 @@ """Add brevia app routers.""" from fastapi import FastAPI -from brevia import index from brevia.routers import ( analyze_router, index_router, @@ -25,5 +24,3 @@ def add_routers(app: FastAPI) -> None: app.include_router(qa_router.router) app.include_router(status_router.router) app.include_router(completion_router.router) - - index.init_index() diff --git a/tests/conftest.py b/tests/conftest.py index 006ddef..1079f0e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,13 +4,13 @@ from alembic import command from alembic.config import Config from dotenv import dotenv_values -from brevia.index import init_index +from brevia.index import init_splitting_data from brevia.settings import get_settings def pytest_sessionstart(session): """Init index data, just once""" - return init_index() + return init_splitting_data() def update_settings():