From 3fe4ec5f27b25c17e49b617c50f7fecd00de31da Mon Sep 17 00:00:00 2001 From: Philippe Prados Date: Fri, 10 Jan 2025 13:45:02 +0100 Subject: [PATCH] Fix remarques --- .../document_loaders/parsers/__init__.py | 10 +-- .../document_loaders/parsers/images.py | 45 +++++----- .../document_loaders/parsers/pdf.py | 90 +++++++++++++------ .../document_loaders/pdf.py | 18 ++-- .../parsers/test_pdf_parsers.py | 37 +++++--- .../document_loaders/test_images.py | 53 +++++------ .../document_loaders/test_pdf.py | 4 +- 7 files changed, 152 insertions(+), 105 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/parsers/__init__.py b/libs/community/langchain_community/document_loaders/parsers/__init__.py index 790402dd3cf428..5f5a9f9d96f3cb 100644 --- a/libs/community/langchain_community/document_loaders/parsers/__init__.py +++ b/libs/community/langchain_community/document_loaders/parsers/__init__.py @@ -17,6 +17,11 @@ from langchain_community.document_loaders.parsers.html import ( BS4HTMLParser, ) + from langchain_community.document_loaders.parsers.images import ( + MultimodalBlobParser, + RapidOCRBlobParser, + TesseractBlobParser, + ) from langchain_community.document_loaders.parsers.language import ( LanguageParser, ) @@ -30,11 +35,6 @@ from langchain_community.document_loaders.parsers.vsdx import ( VsdxParser, ) - from langchain_community.document_loaders.parsers.images import ( - MultimodalBlobParser, - RapidOCRBlobParser, - TesseractBlobParser, - ) _module_lookup = { diff --git a/libs/community/langchain_community/document_loaders/parsers/images.py b/libs/community/langchain_community/document_loaders/parsers/images.py index 496131fafd0d5f..f355566a5cdaff 100644 --- a/libs/community/langchain_community/document_loaders/parsers/images.py +++ b/libs/community/langchain_community/document_loaders/parsers/images.py @@ -3,24 +3,24 @@ import io import logging from abc import abstractmethod - -from PIL import Image from typing import Iterator, Literal -from langchain_community.document_loaders.base import BaseBlobParser -from langchain_community.document_loaders.blob_loaders import Blob from langchain_core.documents import Document from langchain_core.language_models import BaseChatModel from langchain_core.messages import HumanMessage +from PIL.Image import Image + +from langchain_community.document_loaders.base import BaseBlobParser +from langchain_community.document_loaders.blob_loaders import Blob logger = logging.getLogger(__name__) class ImageBlobParser(BaseBlobParser): def __init__( - self, - *, - format: Literal["text", "markdown", "html"] = "text", + self, + *, + format: Literal["text", "markdown", "html"] = "text", ): self.format = format @@ -47,9 +47,9 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: class RapidOCRBlobParser(ImageBlobParser): def __init__( - self, - *, - format: Literal["text", "markdown", "html"] = "text", + self, + *, + format: Literal["text", "markdown", "html"] = "text", ): super().__init__(format=format) self.ocr = None @@ -72,13 +72,11 @@ def _analyze_image(self, img: Image) -> str: class TesseractBlobParser(ImageBlobParser): - def __init__( - self, - *, - format: Literal["text", "markdown", "html"] = "text", - langs: list[str] = ["eng"], - + self, + *, + format: Literal["text", "markdown", "html"] = "text", + langs: list[str] = ["eng"], ): super().__init__(format=format) self.langs = langs @@ -99,18 +97,17 @@ def _analyze_image(self, img: Image) -> str: "images for retrieval. " "These summaries will be embedded and used to retrieve the raw image. " "Give a concise summary of the image that is well optimized for retrieval " - "and extract all the text from the image.") + "and extract all the text from the image." +) class MultimodalBlobParser(ImageBlobParser): - def __init__( - self, - *, - format: Literal["text", "markdown", "html"] = "text", - model: BaseChatModel, - prompt: str = _prompt_images_to_description, - + self, + *, + format: Literal["text", "markdown", "html"] = "text", + model: BaseChatModel, + prompt: str = _prompt_images_to_description, ): super().__init__(format=format) self.model = model diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index 5933e3220737f3..d053528799ba19 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -2,18 +2,15 @@ from __future__ import annotations -import html import io import logging import threading import warnings from datetime import datetime -from urllib.parse import urlparse - -import numpy as np from typing import ( TYPE_CHECKING, Any, + Iterable, Iterator, Literal, Mapping, @@ -21,12 +18,17 @@ Sequence, Union, ) +from urllib.parse import urlparse + +import numpy as np +from langchain_core.documents import Document from langchain_community.document_loaders.base import BaseBlobParser from langchain_community.document_loaders.blob_loaders import Blob -from langchain_community.document_loaders.parsers.images import ImageBlobParser, \ - RapidOCRBlobParser -from langchain_core.documents import Document +from langchain_community.document_loaders.parsers.images import ( + ImageBlobParser, + RapidOCRBlobParser, +) if TYPE_CHECKING: import pdfminer @@ -53,6 +55,38 @@ "JBIG2Decode", ] + +def extract_from_images_with_rapidocr( + images: Sequence[Union[Iterable[np.ndarray], bytes]], +) -> str: + """Extract text from images with RapidOCR. + + Args: + images: Images to extract text from. + + Returns: + Text extracted from images. + + Raises: + ImportError: If `rapidocr-onnxruntime` package is not installed. + """ + try: + from rapidocr_onnxruntime import RapidOCR + except ImportError: + raise ImportError( + "`rapidocr-onnxruntime` package not found, please install it with " + "`pip install rapidocr-onnxruntime`" + ) + ocr = RapidOCR() + text = "" + for img in images: + result, _ = ocr(img) + if result: + result = [text[1] for text in result] + text += "\n".join(result) + return text + + logger = logging.getLogger(__name__) _FORMAT_IMAGE_STR = "\n\n{image_text}\n\n" @@ -60,9 +94,10 @@ _JOIN_TABLES = "\n" _DEFAULT_PAGE_DELIMITOR = "\n\f" -_STD_METADATA_KEYS={"source", "total_pages", "creationdate", "creator", "producer"} +_STD_METADATA_KEYS = {"source", "total_pages", "creationdate", "creator", "producer"} + -def _validate_metadata(metadata: dict[str, Any]) -> dict[str,Any]: +def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]: """Validates the presence of at least the following keys: - source - page (if mode='page') @@ -73,7 +108,7 @@ def _validate_metadata(metadata: dict[str, Any]) -> dict[str,Any]: """ if not _STD_METADATA_KEYS.issubset(metadata.keys()): raise ValueError("The PDF parser must valorize the standard metadata.") - if not isinstance(metadata.get("page",0), int): + if not isinstance(metadata.get("page", 0), int): raise ValueError("The PDF metadata page must be a integer.") return metadata @@ -116,7 +151,10 @@ def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]: return new_metadata -_PARAGRAPH_DELIMITOR = ["\n\n\n", "\n\n"] # To insert images or table in the middle of the page. +_PARAGRAPH_DELIMITOR = [ + "\n\n\n", + "\n\n", +] # To insert images or table in the middle of the page. def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str: @@ -132,7 +170,7 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str: """ def _recurs_merge_text_and_extras( - extras: list[str], text_from_page: str, recurs: bool + extras: list[str], text_from_page: str, recurs: bool ) -> Optional[str]: if extras: for delim in _PARAGRAPH_DELIMITOR: @@ -151,8 +189,9 @@ def _recurs_merge_text_and_extras( str_extras = "\n\n".join(filter(lambda x: x, extras)) if str_extras: all_extras = delim + str_extras - all_text = text_from_page[:pos] + all_extras + text_from_page[ - pos:] + all_text = ( + text_from_page[:pos] + all_extras + text_from_page[pos:] + ) break else: all_text = None @@ -171,7 +210,6 @@ def _recurs_merge_text_and_extras( return all_text - class ImagesPdfParser(BaseBlobParser): """Abstract interface for blob parsers with images_to_text.""" @@ -218,8 +256,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty ) def _extract_text_from_page(page: pypdf.PageObject) -> str: - """Extract text from image given the version of pypdf. - """ + """Extract text from image given the version of pypdf.""" if pypdf.__version__.startswith("3"): return page.extract_text() else: @@ -561,11 +598,11 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty for page in doc: all_text = self._get_page_content(doc, page, blob).strip() if self.mode == "page": - yield Document( page_content=all_text, - metadata=_validate_metadata(doc_metadata | - {"page": page.number}), + metadata=_validate_metadata( + doc_metadata | {"page": page.number} + ), ) else: full_content.append(all_text) @@ -658,17 +695,16 @@ def _extract_images_from_page( if self.images_parser: xref = img[0] pix = pymupdf.Pixmap(doc, xref) - image=np.frombuffer(pix.samples, dtype=np.uint8).reshape( - pix.height, pix.width, -1 - ) + image = np.frombuffer(pix.samples, dtype=np.uint8).reshape( + pix.height, pix.width, -1 + ) image_bytes = io.BytesIO() Image.fromarray(image).save(image_bytes, format="PNG") - blob=Blob.from_data(image_bytes.getvalue(), mime_type="image/png") + blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png") images.append(next(self.images_parser.lazy_parse(blob)).page_content) return _FORMAT_IMAGE_STR.format( - image_text=_JOIN_IMAGES.join(filter(None,images)) - ) - + image_text=_JOIN_IMAGES.join(filter(None, images)) + ) def _extract_tables_from_page(self, page: pymupdf.Page) -> str: """Extract tables from a PDF page. diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index f39c02fe921f8a..4af3aefdb8895a 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -7,9 +7,6 @@ from abc import ABC from io import StringIO from pathlib import Path, PurePath -from urllib.parse import urlparse - -import requests from typing import ( TYPE_CHECKING, Any, @@ -22,13 +19,21 @@ Union, cast, ) +from urllib.parse import urlparse + +import requests +from langchain_core.documents import Document +from langchain_core.utils import get_from_dict_or_env from langchain_community.document_loaders.base import BaseLoader from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.dedoc import DedocBaseLoader -from langchain_community.document_loaders.parsers.images import ImageBlobParser, \ - RapidOCRBlobParser +from langchain_community.document_loaders.parsers.images import ( + ImageBlobParser, + RapidOCRBlobParser, +) from langchain_community.document_loaders.parsers.pdf import ( + _DEFAULT_PAGE_DELIMITOR, AmazonTextractPDFParser, DocumentIntelligenceParser, PDFMinerParser, @@ -36,11 +41,8 @@ PyMuPDFParser, PyPDFium2Parser, PyPDFParser, - _DEFAULT_PAGE_DELIMITOR, ) from langchain_community.document_loaders.unstructured import UnstructuredFileLoader -from langchain_core.documents import Document -from langchain_core.utils import get_from_dict_or_env if TYPE_CHECKING: from textractor.data.text_linearization_config import TextLinearizationConfig diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py index 5ebbeda5642e7d..0fa312f741bbc7 100644 --- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py +++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py @@ -2,10 +2,9 @@ import re from pathlib import Path +from typing import Iterator -import numpy as np import pytest -from typing import Iterator, Type import langchain_community.document_loaders.parsers as pdf_parsers from langchain_community.document_loaders.base import BaseBlobParser @@ -14,7 +13,7 @@ PDFMinerParser, PDFPlumberParser, PyPDFium2Parser, - PyPDFParser, PyMuPDFParser, + PyPDFParser, ) # PDFs to test parsers on. @@ -135,13 +134,11 @@ def test_extract_images_text_from_pdf_pypdfium2parser() -> None: @pytest.mark.parametrize( "mode", - # ["single", "page"], - ["single"], # FIXME + ["single", "page"], ) @pytest.mark.parametrize( "extract_images", - # [True, False], - [True], # FIXME + [True, False], ) @pytest.mark.parametrize( "parser_factory,params", @@ -152,8 +149,15 @@ def test_extract_images_text_from_pdf_pypdfium2parser() -> None: def test_mode_and_extract_images_variations( parser_factory: str, params: dict, mode: str, extract_images: bool ) -> None: + """Apply the same test for all *standard* PDF parsers. + + - Try with mode `single` and `page` + - Try with extract_images `true` and `false` + """ + from PIL.Image import Image + from langchain_community.document_loaders.parsers.images import ImageBlobParser - from PIL import Image + def _std_assert_with_parser(parser: BaseBlobParser) -> None: """Standard tests to verify that the given parser works. @@ -221,15 +225,19 @@ def _analyze_image(self, img: Image) -> str: @pytest.mark.parametrize( "parser_class,params", [ - (PyMuPDFParser, {}), + ("PyMuPDFParser", {}), ], ) def test_parser_with_table( - parser_class: Type, + parser_factory: str, params: dict, mode: str, extract_tables: str, ) -> None: + from PIL.Image import Image + + from langchain_community.document_loaders.parsers.images import ImageBlobParser + def _std_assert_with_parser(parser: BaseBlobParser) -> None: """Standard tests to verify that the given parser works. @@ -271,15 +279,16 @@ def _std_assert_with_parser(parser: BaseBlobParser) -> None: else: assert not len(tables) + class EmptyImageBlobParser(ImageBlobParser): + def _analyze_image(self, img: Image) -> str: + return "![image](.)" - - def images_to_text(images: list[np.ndarray]) -> Iterator[str]: - return iter([""] * len(images)) + parser_class = getattr(pdf_parsers, parser_factory) parser = parser_class( mode=mode, extract_tables=extract_tables, - # images_to_text=images_to_text, # FIXME + images_parser=EmptyImageBlobParser(), **params, ) _std_assert_with_parser(parser) diff --git a/libs/community/tests/integration_tests/document_loaders/test_images.py b/libs/community/tests/integration_tests/document_loaders/test_images.py index f2acb1c2a9a91e..57ac6cd66e615d 100644 --- a/libs/community/tests/integration_tests/document_loaders/test_images.py +++ b/libs/community/tests/integration_tests/document_loaders/test_images.py @@ -1,51 +1,54 @@ import re from pathlib import Path -from typing import Any +from typing import Any, Type import pytest - -from langchain_community.document_loaders.parsers.images import RapidOCRBlobParser, \ - _ImageBlobParser, TesseractBlobParser, MultimodalBlobParser from langchain_core.documents.base import Blob from langchain_openai import ChatOpenAI +from langchain_community.document_loaders.parsers.images import ( + MultimodalBlobParser, + RapidOCRBlobParser, + TesseractBlobParser, +) + building_image = Blob.from_path(Path(__file__).parent.parent / "examples/building.jpg") text_image = Blob.from_path(Path(__file__).parent.parent / "examples/text.png") @pytest.mark.parametrize( - 'blob,body', + "blob,body", [ (building_image, ""), - (text_image, r".*\bMAKE *TEXT\b.*\bSTAND\b.*\bOUT *FROM\b.*\bBACKGROUNDS\b.*") - ] + (text_image, r".*\bMAKE *TEXT\b.*\bSTAND\b.*\bOUT *FROM\b.*\bBACKGROUNDS\b.*"), + ], ) @pytest.mark.parametrize( "format,pattern", [ - ("text", r"(?sm)^{body}$"), - ("markdown", r"(?sm)^!\[{body}]\(\.\)|$"), - ("html", r'(?sm)^({body}|)'), + ("text", r"(?ism)^{body}$"), + ("markdown", r"(?ism)^!\[{body}]\(\.\)|$"), + ("html", r'(?ism)^({body}|)'), ], ) @pytest.mark.parametrize( "blob_loader,kw", [ - (RapidOCRBlobParser,{}), - (TesseractBlobParser,{}), - (MultimodalBlobParser,{"model":ChatOpenAI(model="gpt-4o", max_tokens=1024)}) - ] + (RapidOCRBlobParser, {}), + (TesseractBlobParser, {}), + (MultimodalBlobParser, {"model": ChatOpenAI(model="gpt-4o", max_tokens=1024)}), + ], ) def test_image_parser_with_differents_format_and_files( - blob_loader, #: _ImageBlobParser, - kw:dict[str,any], - blob: Blob, - body: str, - format: str, - pattern: str, - ) -> None: - if blob_loader == MultimodalBlobParser: - body=".*building.*" - documents = list(blob_loader(format=format,**kw).lazy_parse(blob)) - assert (len(documents) == 1) + blob_loader: Type, + kw: dict[str, Any], + format: str, + pattern: str, + blob: Blob, + body: str, +) -> None: + if blob_loader == MultimodalBlobParser and "building" in str(blob.path): + body = ".*building.*" + documents = list(blob_loader(format=format, **kw).lazy_parse(blob)) + assert len(documents) == 1 assert re.compile(pattern.format(body=body)).match(documents[0].page_content) diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py index 1f22ba8c24be10..c1b8e43caa7b3d 100644 --- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py +++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py @@ -1,6 +1,6 @@ import os from pathlib import Path -from typing import Sequence, Union, Type +from typing import Sequence, Union import pytest @@ -209,7 +209,7 @@ def test_amazontextract_loader_failures() -> None: @pytest.mark.parametrize( - "loader_class,params", + "parser_factory,params", [ ("PyMuPDFLoader", {}), ],