From 3fe4ec5f27b25c17e49b617c50f7fecd00de31da Mon Sep 17 00:00:00 2001
From: Philippe Prados <github@prados.fr>
Date: Fri, 10 Jan 2025 13:45:02 +0100
Subject: [PATCH] Address review remarks

---
 .../document_loaders/parsers/__init__.py      | 10 +--
 .../document_loaders/parsers/images.py        | 45 +++++-----
 .../document_loaders/parsers/pdf.py           | 90 +++++++++++++------
 .../document_loaders/pdf.py                   | 18 ++--
 .../parsers/test_pdf_parsers.py               | 37 +++++---
 .../document_loaders/test_images.py           | 53 +++++------
 .../document_loaders/test_pdf.py              |  4 +-
 7 files changed, 152 insertions(+), 105 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/parsers/__init__.py b/libs/community/langchain_community/document_loaders/parsers/__init__.py
index 790402dd3cf428..5f5a9f9d96f3cb 100644
--- a/libs/community/langchain_community/document_loaders/parsers/__init__.py
+++ b/libs/community/langchain_community/document_loaders/parsers/__init__.py
@@ -17,6 +17,11 @@
     from langchain_community.document_loaders.parsers.html import (
         BS4HTMLParser,
     )
+    from langchain_community.document_loaders.parsers.images import (
+        MultimodalBlobParser,
+        RapidOCRBlobParser,
+        TesseractBlobParser,
+    )
     from langchain_community.document_loaders.parsers.language import (
         LanguageParser,
     )
@@ -30,11 +35,6 @@
     from langchain_community.document_loaders.parsers.vsdx import (
         VsdxParser,
     )
-    from langchain_community.document_loaders.parsers.images import (
-        MultimodalBlobParser,
-        RapidOCRBlobParser,
-        TesseractBlobParser,
-    )
 
 
 _module_lookup = {
diff --git a/libs/community/langchain_community/document_loaders/parsers/images.py b/libs/community/langchain_community/document_loaders/parsers/images.py
index 496131fafd0d5f..f355566a5cdaff 100644
--- a/libs/community/langchain_community/document_loaders/parsers/images.py
+++ b/libs/community/langchain_community/document_loaders/parsers/images.py
@@ -3,24 +3,24 @@
 import io
 import logging
 from abc import abstractmethod
-
-from PIL import Image
 from typing import Iterator, Literal
 
-from langchain_community.document_loaders.base import BaseBlobParser
-from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_core.documents import Document
 from langchain_core.language_models import BaseChatModel
 from langchain_core.messages import HumanMessage
+from PIL.Image import Image
+
+from langchain_community.document_loaders.base import BaseBlobParser
+from langchain_community.document_loaders.blob_loaders import Blob
 
 logger = logging.getLogger(__name__)
 
 
 class ImageBlobParser(BaseBlobParser):
     def __init__(
-            self,
-            *,
-            format: Literal["text", "markdown", "html"] = "text",
+        self,
+        *,
+        format: Literal["text", "markdown", "html"] = "text",
     ):
         self.format = format
 
@@ -47,9 +47,9 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:
 
 class RapidOCRBlobParser(ImageBlobParser):
     def __init__(
-            self,
-            *,
-            format: Literal["text", "markdown", "html"] = "text",
+        self,
+        *,
+        format: Literal["text", "markdown", "html"] = "text",
     ):
         super().__init__(format=format)
         self.ocr = None
@@ -72,13 +72,11 @@ def _analyze_image(self, img: Image) -> str:
 
 
 class TesseractBlobParser(ImageBlobParser):
-
     def __init__(
-            self,
-            *,
-            format: Literal["text", "markdown", "html"] = "text",
-            langs: list[str] = ["eng"],
-
+        self,
+        *,
+        format: Literal["text", "markdown", "html"] = "text",
+        langs: list[str] = ["eng"],
     ):
         super().__init__(format=format)
         self.langs = langs
@@ -99,18 +97,17 @@ def _analyze_image(self, img: Image) -> str:
     "images for retrieval. "
     "These summaries will be embedded and used to retrieve the raw image. "
     "Give a concise summary of the image that is well optimized for retrieval "
-    "and extract all the text from the image.")
+    "and extract all the text from the image."
+)
 
 
 class MultimodalBlobParser(ImageBlobParser):
-
     def __init__(
-            self,
-            *,
-            format: Literal["text", "markdown", "html"] = "text",
-            model: BaseChatModel,
-            prompt: str = _prompt_images_to_description,
-
+        self,
+        *,
+        format: Literal["text", "markdown", "html"] = "text",
+        model: BaseChatModel,
+        prompt: str = _prompt_images_to_description,
     ):
         super().__init__(format=format)
         self.model = model
diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
index 5933e3220737f3..d053528799ba19 100644
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -2,18 +2,15 @@
 
 from __future__ import annotations
 
-import html
 import io
 import logging
 import threading
 import warnings
 from datetime import datetime
-from urllib.parse import urlparse
-
-import numpy as np
 from typing import (
     TYPE_CHECKING,
     Any,
+    Iterable,
     Iterator,
     Literal,
     Mapping,
@@ -21,12 +18,17 @@
     Sequence,
     Union,
 )
+from urllib.parse import urlparse
+
+import numpy as np
+from langchain_core.documents import Document
 
 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
-from langchain_community.document_loaders.parsers.images import ImageBlobParser, \
-    RapidOCRBlobParser
-from langchain_core.documents import Document
+from langchain_community.document_loaders.parsers.images import (
+    ImageBlobParser,
+    RapidOCRBlobParser,
+)
 
 if TYPE_CHECKING:
     import pdfminer
@@ -53,6 +55,38 @@
     "JBIG2Decode",
 ]
 
+
+def extract_from_images_with_rapidocr(
+    images: Sequence[Union[Iterable[np.ndarray], bytes]],
+) -> str:
+    """Extract text from images with RapidOCR.
+
+    Args:
+        images: Images to extract text from.
+
+    Returns:
+        Text extracted from images.
+
+    Raises:
+        ImportError: If `rapidocr-onnxruntime` package is not installed.
+    """
+    try:
+        from rapidocr_onnxruntime import RapidOCR
+    except ImportError as e:
+        raise ImportError(
+            "`rapidocr-onnxruntime` package not found, please install it with "
+            "`pip install rapidocr-onnxruntime`"
+        ) from e
+    ocr = RapidOCR()
+    text = ""
+    for img in images:
+        result, _ = ocr(img)
+        if result:
+            lines = [item[1] for item in result]
+            text += "\n".join(lines)
+    return text
+
+
 logger = logging.getLogger(__name__)
 
 _FORMAT_IMAGE_STR = "\n\n{image_text}\n\n"
@@ -60,9 +94,10 @@
 _JOIN_TABLES = "\n"
 _DEFAULT_PAGE_DELIMITOR = "\n\f"
 
-_STD_METADATA_KEYS={"source", "total_pages", "creationdate", "creator", "producer"}
+_STD_METADATA_KEYS = {"source", "total_pages", "creationdate", "creator", "producer"}
+
 
-def _validate_metadata(metadata: dict[str, Any]) -> dict[str,Any]:
+def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
     """Validates the presence of at least the following keys:
     - source
     - page (if mode='page')
@@ -73,7 +108,7 @@ def _validate_metadata(metadata: dict[str, Any]) -> dict[str,Any]:
     """
     if not _STD_METADATA_KEYS.issubset(metadata.keys()):
         raise ValueError("The PDF parser must valorize the standard metadata.")
-    if not isinstance(metadata.get("page",0), int):
+    if not isinstance(metadata.get("page", 0), int):
         raise ValueError("The PDF metadata page must be a integer.")
     return metadata
 
@@ -116,7 +151,10 @@ def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
     return new_metadata
 
 
-_PARAGRAPH_DELIMITOR = ["\n\n\n", "\n\n"]  # To insert images or table in the middle of the page.
+_PARAGRAPH_DELIMITOR = [
+    "\n\n\n",
+    "\n\n",
+]  # To insert images or table in the middle of the page.
 
 
 def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
@@ -132,7 +170,7 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
     """
 
     def _recurs_merge_text_and_extras(
-            extras: list[str], text_from_page: str, recurs: bool
+        extras: list[str], text_from_page: str, recurs: bool
     ) -> Optional[str]:
         if extras:
             for delim in _PARAGRAPH_DELIMITOR:
@@ -151,8 +189,9 @@ def _recurs_merge_text_and_extras(
                         str_extras = "\n\n".join(filter(lambda x: x, extras))
                         if str_extras:
                             all_extras = delim + str_extras
-                        all_text = text_from_page[:pos] + all_extras + text_from_page[
-                                                                       pos:]
+                        all_text = (
+                            text_from_page[:pos] + all_extras + text_from_page[pos:]
+                        )
                     break
             else:
                 all_text = None
@@ -171,7 +210,6 @@ def _recurs_merge_text_and_extras(
     return all_text
 
 
-
 class ImagesPdfParser(BaseBlobParser):
     """Abstract interface for blob parsers with images_to_text."""
 
@@ -218,8 +256,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-ty
             )
 
         def _extract_text_from_page(page: pypdf.PageObject) -> str:
-            """Extract text from image given the version of pypdf.
-            """
+            """Extract text from image given the version of pypdf."""
             if pypdf.__version__.startswith("3"):
                 return page.extract_text()
             else:
@@ -561,11 +598,11 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-ty
                 for page in doc:
                     all_text = self._get_page_content(doc, page, blob).strip()
                     if self.mode == "page":
-
                         yield Document(
                             page_content=all_text,
-                            metadata=_validate_metadata(doc_metadata |
-                                                        {"page": page.number}),
+                            metadata=_validate_metadata(
+                                doc_metadata | {"page": page.number}
+                            ),
                         )
                     else:
                         full_content.append(all_text)
@@ -658,17 +695,16 @@ def _extract_images_from_page(
             if self.images_parser:
                 xref = img[0]
                 pix = pymupdf.Pixmap(doc, xref)
-                image=np.frombuffer(pix.samples, dtype=np.uint8).reshape(
-                        pix.height, pix.width, -1
-                    )
+                image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
+                    pix.height, pix.width, -1
+                )
                 image_bytes = io.BytesIO()
                 Image.fromarray(image).save(image_bytes, format="PNG")
-                blob=Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
+                blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
                 images.append(next(self.images_parser.lazy_parse(blob)).page_content)
         return _FORMAT_IMAGE_STR.format(
-                image_text=_JOIN_IMAGES.join(filter(None,images))
-            )
-
+            image_text=_JOIN_IMAGES.join(filter(None, images))
+        )
 
     def _extract_tables_from_page(self, page: pymupdf.Page) -> str:
         """Extract tables from a PDF page.
diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py
index f39c02fe921f8a..4af3aefdb8895a 100644
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@@ -7,9 +7,6 @@
 from abc import ABC
 from io import StringIO
 from pathlib import Path, PurePath
-from urllib.parse import urlparse
-
-import requests
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -22,13 +19,21 @@
     Union,
     cast,
 )
+from urllib.parse import urlparse
+
+import requests
+from langchain_core.documents import Document
+from langchain_core.utils import get_from_dict_or_env
 
 from langchain_community.document_loaders.base import BaseLoader
 from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_community.document_loaders.dedoc import DedocBaseLoader
-from langchain_community.document_loaders.parsers.images import ImageBlobParser, \
-    RapidOCRBlobParser
+from langchain_community.document_loaders.parsers.images import (
+    ImageBlobParser,
+    RapidOCRBlobParser,
+)
 from langchain_community.document_loaders.parsers.pdf import (
+    _DEFAULT_PAGE_DELIMITOR,
     AmazonTextractPDFParser,
     DocumentIntelligenceParser,
     PDFMinerParser,
@@ -36,11 +41,8 @@
     PyMuPDFParser,
     PyPDFium2Parser,
     PyPDFParser,
-    _DEFAULT_PAGE_DELIMITOR,
 )
 from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
-from langchain_core.documents import Document
-from langchain_core.utils import get_from_dict_or_env
 
 if TYPE_CHECKING:
     from textractor.data.text_linearization_config import TextLinearizationConfig
diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
index 5ebbeda5642e7d..0fa312f741bbc7 100644
--- a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
+++ b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@@ -2,10 +2,9 @@
 
 import re
 from pathlib import Path
+from typing import Iterator
 
-import numpy as np
 import pytest
-from typing import Iterator, Type
 
 import langchain_community.document_loaders.parsers as pdf_parsers
 from langchain_community.document_loaders.base import BaseBlobParser
@@ -14,7 +13,7 @@
     PDFMinerParser,
     PDFPlumberParser,
     PyPDFium2Parser,
-    PyPDFParser, PyMuPDFParser,
+    PyPDFParser,
 )
 
 # PDFs to test parsers on.
@@ -135,13 +134,11 @@ def test_extract_images_text_from_pdf_pypdfium2parser() -> None:
 
 @pytest.mark.parametrize(
     "mode",
-    # ["single", "page"],
-    ["single"],  # FIXME
+    ["single", "page"],
 )
 @pytest.mark.parametrize(
     "extract_images",
-    # [True, False],
-    [True],  # FIXME
+    [True, False],
 )
 @pytest.mark.parametrize(
     "parser_factory,params",
@@ -152,8 +149,15 @@ def test_extract_images_text_from_pdf_pypdfium2parser() -> None:
 def test_mode_and_extract_images_variations(
     parser_factory: str, params: dict, mode: str, extract_images: bool
 ) -> None:
+    """Apply the same test for all *standard* PDF parsers.
+
+    - Try with mode `single` and `page`
+    - Try with extract_images `true` and `false`
+    """
+    from PIL.Image import Image
+
     from langchain_community.document_loaders.parsers.images import ImageBlobParser
-    from PIL import Image
+
     def _std_assert_with_parser(parser: BaseBlobParser) -> None:
         """Standard tests to verify that the given parser works.
 
@@ -221,15 +225,19 @@ def _analyze_image(self, img: Image) -> str:
 @pytest.mark.parametrize(
-    "parser_class,params",
+    "parser_factory,params",
     [
-        (PyMuPDFParser, {}),
+        ("PyMuPDFParser", {}),
     ],
 )
 def test_parser_with_table(
-    parser_class: Type,
+    parser_factory: str,
     params: dict,
     mode: str,
     extract_tables: str,
 ) -> None:
+    from PIL.Image import Image
+
+    from langchain_community.document_loaders.parsers.images import ImageBlobParser
+
     def _std_assert_with_parser(parser: BaseBlobParser) -> None:
         """Standard tests to verify that the given parser works.
 
@@ -271,15 +279,16 @@ def _std_assert_with_parser(parser: BaseBlobParser) -> None:
         else:
             assert not len(tables)
 
+    class EmptyImageBlobParser(ImageBlobParser):
+        def _analyze_image(self, img: Image) -> str:
+            return "![image](.)"
 
-
-    def images_to_text(images: list[np.ndarray]) -> Iterator[str]:
-        return iter(["<!-- image -->"] * len(images))
+    parser_class = getattr(pdf_parsers, parser_factory)
 
     parser = parser_class(
         mode=mode,
         extract_tables=extract_tables,
-        # images_to_text=images_to_text, # FIXME
+        images_parser=EmptyImageBlobParser(),
         **params,
     )
     _std_assert_with_parser(parser)
diff --git a/libs/community/tests/integration_tests/document_loaders/test_images.py b/libs/community/tests/integration_tests/document_loaders/test_images.py
index f2acb1c2a9a91e..57ac6cd66e615d 100644
--- a/libs/community/tests/integration_tests/document_loaders/test_images.py
+++ b/libs/community/tests/integration_tests/document_loaders/test_images.py
@@ -1,51 +1,54 @@
 import re
 from pathlib import Path
-from typing import Any
+from typing import Any, Type
 
 import pytest
-
-from langchain_community.document_loaders.parsers.images import RapidOCRBlobParser, \
-    _ImageBlobParser, TesseractBlobParser, MultimodalBlobParser
 from langchain_core.documents.base import Blob
 from langchain_openai import ChatOpenAI
 
+from langchain_community.document_loaders.parsers.images import (
+    MultimodalBlobParser,
+    RapidOCRBlobParser,
+    TesseractBlobParser,
+)
+
 building_image = Blob.from_path(Path(__file__).parent.parent / "examples/building.jpg")
 text_image = Blob.from_path(Path(__file__).parent.parent / "examples/text.png")
 
 
 @pytest.mark.parametrize(
-    'blob,body',
+    "blob,body",
     [
         (building_image, ""),
-        (text_image, r".*\bMAKE *TEXT\b.*\bSTAND\b.*\bOUT *FROM\b.*\bBACKGROUNDS\b.*")
-    ]
+        (text_image, r".*\bMAKE *TEXT\b.*\bSTAND\b.*\bOUT *FROM\b.*\bBACKGROUNDS\b.*"),
+    ],
 )
 @pytest.mark.parametrize(
     "format,pattern",
     [
-        ("text", r"(?sm)^{body}$"),
-        ("markdown", r"(?sm)^!\[{body}]\(\.\)|$"),
-        ("html", r'(?sm)^(<img alt="{body}" />|)'),
+        ("text", r"(?ism)^{body}$"),
+        ("markdown", r"(?ism)^!\[{body}]\(\.\)|$"),
+        ("html", r'(?ism)^(<img alt="{body}" />|)'),
     ],
 )
 @pytest.mark.parametrize(
     "blob_loader,kw",
     [
-        (RapidOCRBlobParser,{}),
-        (TesseractBlobParser,{}),
-        (MultimodalBlobParser,{"model":ChatOpenAI(model="gpt-4o", max_tokens=1024)})
-    ]
+        (RapidOCRBlobParser, {}),
+        (TesseractBlobParser, {}),
+        (MultimodalBlobParser, {"model": ChatOpenAI(model="gpt-4o", max_tokens=1024)}),
+    ],
 )
 def test_image_parser_with_differents_format_and_files(
-        blob_loader, #: _ImageBlobParser,
-        kw:dict[str,any],
-        blob: Blob,
-        body: str,
-        format: str,
-        pattern: str,
-    ) -> None:
-    if blob_loader == MultimodalBlobParser:
-        body=".*building.*"
-    documents = list(blob_loader(format=format,**kw).lazy_parse(blob))
-    assert (len(documents) == 1)
+    blob_loader: Type,
+    kw: dict[str, Any],
+    format: str,
+    pattern: str,
+    blob: Blob,
+    body: str,
+) -> None:
+    if blob_loader == MultimodalBlobParser and "building" in str(blob.path):
+        body = ".*building.*"
+    documents = list(blob_loader(format=format, **kw).lazy_parse(blob))
+    assert len(documents) == 1
     assert re.compile(pattern.format(body=body)).match(documents[0].page_content)
diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py
index 1f22ba8c24be10..c1b8e43caa7b3d 100644
--- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py
+++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py
@@ -1,6 +1,6 @@
 import os
 from pathlib import Path
-from typing import Sequence, Union, Type
+from typing import Sequence, Union
 
 import pytest
 
@@ -209,7 +209,7 @@ def test_amazontextract_loader_failures() -> None:
 
 
 @pytest.mark.parametrize(
-    "loader_class,params",
+    "parser_factory,params",
     [
         ("PyMuPDFLoader", {}),
     ],