Skip to content

Commit

Permalink
Merge pull request #14 from LyzrCore/change-pdf-parser
Browse files Browse the repository at this point in the history
changed pdf parser to Layout Pdf Parser
  • Loading branch information
patel-lyzr authored Feb 14, 2024
2 parents 750d21c + 9cee8cc commit 6002da0
Show file tree
Hide file tree
Showing 12 changed files with 146 additions and 122 deletions.
9 changes: 1 addition & 8 deletions build/lib/lyzr/base/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,27 +22,20 @@ def from_defaults(
system_prompt: str = None,
query_wrapper_prompt: Union[str, BasePromptTemplate] = None,
**kwargs,
) -> ServiceContext:
) -> ServiceContext:
if isinstance(query_wrapper_prompt, str):
query_wrapper_prompt = PromptTemplate(template=query_wrapper_prompt)

callback_manager: CallbackManager = kwargs.get(
"callback_manager", CallbackManager()
)

node_parser = SimpleNodeParser.from_defaults(
chunk_size=750,
chunk_overlap=100,
callback_manager=callback_manager,
)

service_context = ServiceContext.from_defaults(
llm=llm,
embed_model=embed_model,
system_prompt=system_prompt,
query_wrapper_prompt=query_wrapper_prompt,
callback_manager=callback_manager,
node_parser=node_parser,
**kwargs,
)

Expand Down
24 changes: 11 additions & 13 deletions build/lib/lyzr/utils/pdf_reader.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,29 @@
from typing import List

from langchain.document_loaders import PDFMinerLoader
from llmsherpa.readers import LayoutPDFReader
from llama_index.readers.base import BaseReader
from llama_index.schema import Document


class LyzrPDFReader(BaseReader):
    """Read a PDF into llama_index ``Document`` objects using llmsherpa.

    The PDF is parsed with ``LayoutPDFReader`` and every chunk it yields
    becomes one ``Document`` whose text is the chunk's context text.
    """

    # Parsing endpoint handed to ``LayoutPDFReader``.
    # NOTE(review): this is a hosted third-party URL, so the PDF presumably
    # leaves the local machine when parsed -- confirm that is acceptable for
    # private documents.
    LLMSHERPA_API_URL = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"

    def __init__(self) -> None:
        """Check that the ``llmsherpa`` dependency can be imported.

        Raises:
            ImportError: If ``llmsherpa`` is not installed.
        """
        try:
            from llmsherpa.readers import LayoutPDFReader  # noqa: F401
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "`llmsherpa` package not found, please install it with "
                "`pip install llmsherpa`"
            ) from err

    def load_data(self, file_path: str, extra_info: dict = None) -> List[Document]:
        """Parse the PDF at ``file_path`` and return one ``Document`` per chunk.

        Args:
            file_path: Location of the PDF; converted with ``str()`` before use.
            extra_info: Optional extra metadata merged into every document's
                metadata. Its keys take precedence over the default
                ``"source"`` entry.

        Returns:
            A list of ``Document`` objects, one for each chunk reported by the
            parsed PDF.
        """
        loader = LayoutPDFReader(self.LLMSHERPA_API_URL)
        parsed_pdf = loader.read_pdf(str(file_path))

        base_metadata = {"source": str(file_path)}
        if extra_info is not None:
            # The PDFMiner-based implementation honoured ``extra_info``;
            # keep doing so instead of silently dropping the argument.
            base_metadata.update(extra_info)

        # Copy the metadata per document so that mutating one document's
        # metadata later cannot leak into its siblings.
        return [
            Document(text=chunk.to_context_text(), metadata=dict(base_metadata))
            for chunk in parsed_pdf.chunks()
        ]
84 changes: 54 additions & 30 deletions build/lib/lyzr/utils/rag_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,18 @@ def pdf_rag(
)

vector_store_index = LyzrVectorStoreIndex.from_defaults(
**vector_store_params, documents=documents, service_context=service_context
**vector_store_params,
documents=documents,
service_context=service_context,
similarity_top_k=10,
)

retriever = LyzrRetriever.from_defaults(
**retriever_params, base_index=vector_store_index
)
# retriever = LyzrRetriever.from_defaults(
# **retriever_params, base_index=vector_store_index
# )

query_engine = RetrieverQueryEngine.from_args(retriever, query_engine_params)
# query_engine = RetrieverQueryEngine.from_args(retriever, query_engine_params)
query_engine = vector_store_index.as_query_engine(similarity_top_k=10)

return query_engine

Expand Down Expand Up @@ -136,14 +140,18 @@ def txt_rag(
)

vector_store_index = LyzrVectorStoreIndex.from_defaults(
**vector_store_params, documents=documents, service_context=service_context
**vector_store_params,
documents=documents,
service_context=service_context,
similarity_top_k=10,
)

retriever = LyzrRetriever.from_defaults(
**retriever_params, base_index=vector_store_index
)
# retriever = LyzrRetriever.from_defaults(
# **retriever_params, base_index=vector_store_index
# )

query_engine = RetrieverQueryEngine.from_args(retriever, query_engine_params)
# query_engine = RetrieverQueryEngine.from_args(retriever, query_engine_params)
query_engine = vector_store_index.as_query_engine(similarity_top_k=10)

return query_engine

Expand Down Expand Up @@ -201,14 +209,18 @@ def docx_rag(
)

vector_store_index = LyzrVectorStoreIndex.from_defaults(
**vector_store_params, documents=documents, service_context=service_context
**vector_store_params,
documents=documents,
service_context=service_context,
similarity_top_k=10,
)

retriever = LyzrRetriever.from_defaults(
**retriever_params, base_index=vector_store_index
)
# retriever = LyzrRetriever.from_defaults(
# **retriever_params, base_index=vector_store_index
# )

query_engine = RetrieverQueryEngine.from_args(retriever, query_engine_params)
# query_engine = RetrieverQueryEngine.from_args(retriever, query_engine_params)
query_engine = vector_store_index.as_query_engine(similarity_top_k=10)

return query_engine

Expand Down Expand Up @@ -256,14 +268,18 @@ def webpage_rag(
)

vector_store_index = LyzrVectorStoreIndex.from_defaults(
**vector_store_params, documents=documents, service_context=service_context
**vector_store_params,
documents=documents,
service_context=service_context,
similarity_top_k=10,
)

retriever = LyzrRetriever.from_defaults(
**retriever_params, base_index=vector_store_index
)
# retriever = LyzrRetriever.from_defaults(
# **retriever_params, base_index=vector_store_index
# )

query_engine = RetrieverQueryEngine.from_args(retriever, query_engine_params)
# query_engine = RetrieverQueryEngine.from_args(retriever, query_engine_params)
query_engine = vector_store_index.as_query_engine(similarity_top_k=10)

return query_engine

Expand Down Expand Up @@ -311,14 +327,18 @@ def website_rag(
)

vector_store_index = LyzrVectorStoreIndex.from_defaults(
**vector_store_params, documents=documents, service_context=service_context
**vector_store_params,
documents=documents,
service_context=service_context,
similarity_top_k=10,
)

retriever = LyzrRetriever.from_defaults(
**retriever_params, base_index=vector_store_index
)
# retriever = LyzrRetriever.from_defaults(
# **retriever_params, base_index=vector_store_index
# )

query_engine = RetrieverQueryEngine.from_args(retriever, query_engine_params)
# query_engine = RetrieverQueryEngine.from_args(retriever, query_engine_params)
query_engine = vector_store_index.as_query_engine(similarity_top_k=10)

return query_engine

Expand Down Expand Up @@ -366,13 +386,17 @@ def youtube_rag(
)

vector_store_index = LyzrVectorStoreIndex.from_defaults(
**vector_store_params, documents=documents, service_context=service_context
**vector_store_params,
documents=documents,
service_context=service_context,
similarity_top_k=10,
)

retriever = LyzrRetriever.from_defaults(
**retriever_params, base_index=vector_store_index
)
# retriever = LyzrRetriever.from_defaults(
# **retriever_params, base_index=vector_store_index
# )

query_engine = RetrieverQueryEngine.from_args(retriever, query_engine_params)
# query_engine = RetrieverQueryEngine.from_args(retriever, query_engine_params)
query_engine = vector_store_index.as_query_engine(similarity_top_k=10)

return query_engine
Binary file removed dist/lyzr-0.1.23.tar.gz
Binary file not shown.
Binary file not shown.
Binary file added dist/lyzr-0.1.24.tar.gz
Binary file not shown.
20 changes: 7 additions & 13 deletions lyzr.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -1,25 +1,17 @@
Metadata-Version: 2.1
Name: lyzr
Version: 0.1.23
Home-page:
Version: 0.1.24
Summary: UNKNOWN
Home-page: UNKNOWN
Author: lyzr
License: UNKNOWN
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.8.1, <3.12
Description-Content-Type: text/markdown
License-File: LICENSE.txt
Requires-Dist: asyncio
Requires-Dist: nest_asyncio
Requires-Dist: openai==1.3.4
Requires-Dist: litellm==1.2.0
Requires-Dist: llama-index==0.9.4
Requires-Dist: langchain==0.0.339
Requires-Dist: python-dotenv>=1.0.0
Requires-Dist: beautifulsoup4==4.12.2
Requires-Dist: pandas==2.0.2
Requires-Dist: matplotlib==3.8.2
Requires-Dist: weaviate-client==3.25.3

# lyzr

Expand Down Expand Up @@ -80,3 +72,5 @@ Replace `[version]` with the actual version of the package you have built.
## License

`lyzr` is distributed under the terms of the [MIT](https://spdx.org/licenses/MIT.html) license.


12 changes: 6 additions & 6 deletions lyzr.egg-info/requires.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
asyncio
nest_asyncio
openai==1.3.4
beautifulsoup4==4.12.2
langchain==0.0.339
litellm==1.2.0
llama-index==0.9.4
langchain==0.0.339
python-dotenv>=1.0.0
beautifulsoup4==4.12.2
pandas==2.0.2
matplotlib==3.8.2
nest_asyncio
openai==1.3.4
pandas==2.0.2
python-dotenv>=1.0.0
weaviate-client==3.25.3
9 changes: 1 addition & 8 deletions lyzr/base/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,27 +22,20 @@ def from_defaults(
system_prompt: str = None,
query_wrapper_prompt: Union[str, BasePromptTemplate] = None,
**kwargs,
) -> ServiceContext:
) -> ServiceContext:
if isinstance(query_wrapper_prompt, str):
query_wrapper_prompt = PromptTemplate(template=query_wrapper_prompt)

callback_manager: CallbackManager = kwargs.get(
"callback_manager", CallbackManager()
)

node_parser = SimpleNodeParser.from_defaults(
chunk_size=750,
chunk_overlap=100,
callback_manager=callback_manager,
)

service_context = ServiceContext.from_defaults(
llm=llm,
embed_model=embed_model,
system_prompt=system_prompt,
query_wrapper_prompt=query_wrapper_prompt,
callback_manager=callback_manager,
node_parser=node_parser,
**kwargs,
)

Expand Down
24 changes: 11 additions & 13 deletions lyzr/utils/pdf_reader.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,29 @@
from typing import List

from langchain.document_loaders import PDFMinerLoader
from llmsherpa.readers import LayoutPDFReader
from llama_index.readers.base import BaseReader
from llama_index.schema import Document


class LyzrPDFReader(BaseReader):
    """Read a PDF into llama_index ``Document`` objects using llmsherpa.

    The PDF is parsed with ``LayoutPDFReader`` and every chunk it yields
    becomes one ``Document`` whose text is the chunk's context text.
    """

    # Parsing endpoint handed to ``LayoutPDFReader``.
    # NOTE(review): this is a hosted third-party URL, so the PDF presumably
    # leaves the local machine when parsed -- confirm that is acceptable for
    # private documents.
    LLMSHERPA_API_URL = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"

    def __init__(self) -> None:
        """Check that the ``llmsherpa`` dependency can be imported.

        Raises:
            ImportError: If ``llmsherpa`` is not installed.
        """
        try:
            from llmsherpa.readers import LayoutPDFReader  # noqa: F401
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "`llmsherpa` package not found, please install it with "
                "`pip install llmsherpa`"
            ) from err

    def load_data(self, file_path: str, extra_info: dict = None) -> List[Document]:
        """Parse the PDF at ``file_path`` and return one ``Document`` per chunk.

        Args:
            file_path: Location of the PDF; converted with ``str()`` before use.
            extra_info: Optional extra metadata merged into every document's
                metadata. Its keys take precedence over the default
                ``"source"`` entry.

        Returns:
            A list of ``Document`` objects, one for each chunk reported by the
            parsed PDF.
        """
        loader = LayoutPDFReader(self.LLMSHERPA_API_URL)
        parsed_pdf = loader.read_pdf(str(file_path))

        base_metadata = {"source": str(file_path)}
        if extra_info is not None:
            # The PDFMiner-based implementation honoured ``extra_info``;
            # keep doing so instead of silently dropping the argument.
            base_metadata.update(extra_info)

        # Copy the metadata per document so that mutating one document's
        # metadata later cannot leak into its siblings.
        return [
            Document(text=chunk.to_context_text(), metadata=dict(base_metadata))
            for chunk in parsed_pdf.chunks()
        ]
Loading

0 comments on commit 6002da0

Please sign in to comment.