Skip to content

Commit

Permalink
Fix remarques
Browse files Browse the repository at this point in the history
  • Loading branch information
pprados committed Jan 10, 2025
1 parent 20f5a41 commit 3fe4ec5
Show file tree
Hide file tree
Showing 7 changed files with 152 additions and 105 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
from langchain_community.document_loaders.parsers.html import (
BS4HTMLParser,
)
from langchain_community.document_loaders.parsers.images import (
MultimodalBlobParser,
RapidOCRBlobParser,
TesseractBlobParser,
)
from langchain_community.document_loaders.parsers.language import (
LanguageParser,
)
Expand All @@ -30,11 +35,6 @@
from langchain_community.document_loaders.parsers.vsdx import (
VsdxParser,
)
from langchain_community.document_loaders.parsers.images import (
MultimodalBlobParser,
RapidOCRBlobParser,
TesseractBlobParser,
)


_module_lookup = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,24 @@
import io
import logging
from abc import abstractmethod

from PIL import Image
from typing import Iterator, Literal

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_core.documents import Document
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import HumanMessage
from PIL.Image import Image

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob

logger = logging.getLogger(__name__)


class ImageBlobParser(BaseBlobParser):
def __init__(
self,
*,
format: Literal["text", "markdown", "html"] = "text",
self,
*,
format: Literal["text", "markdown", "html"] = "text",
):
self.format = format

Expand All @@ -47,9 +47,9 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:

class RapidOCRBlobParser(ImageBlobParser):
def __init__(
self,
*,
format: Literal["text", "markdown", "html"] = "text",
self,
*,
format: Literal["text", "markdown", "html"] = "text",
):
super().__init__(format=format)
self.ocr = None
Expand All @@ -72,13 +72,11 @@ def _analyze_image(self, img: Image) -> str:


class TesseractBlobParser(ImageBlobParser):

def __init__(
self,
*,
format: Literal["text", "markdown", "html"] = "text",
langs: list[str] = ["eng"],

self,
*,
format: Literal["text", "markdown", "html"] = "text",
langs: list[str] = ["eng"],
):
super().__init__(format=format)
self.langs = langs
Expand All @@ -99,18 +97,17 @@ def _analyze_image(self, img: Image) -> str:
"images for retrieval. "
"These summaries will be embedded and used to retrieve the raw image. "
"Give a concise summary of the image that is well optimized for retrieval "
"and extract all the text from the image.")
"and extract all the text from the image."
)


class MultimodalBlobParser(ImageBlobParser):

def __init__(
self,
*,
format: Literal["text", "markdown", "html"] = "text",
model: BaseChatModel,
prompt: str = _prompt_images_to_description,

self,
*,
format: Literal["text", "markdown", "html"] = "text",
model: BaseChatModel,
prompt: str = _prompt_images_to_description,
):
super().__init__(format=format)
self.model = model
Expand Down
90 changes: 63 additions & 27 deletions libs/community/langchain_community/document_loaders/parsers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,33 @@

from __future__ import annotations

import html
import io
import logging
import threading
import warnings
from datetime import datetime
from urllib.parse import urlparse

import numpy as np
from typing import (
TYPE_CHECKING,
Any,
Iterable,
Iterator,
Literal,
Mapping,
Optional,
Sequence,
Union,
)
from urllib.parse import urlparse

import numpy as np
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.images import ImageBlobParser, \
RapidOCRBlobParser
from langchain_core.documents import Document
from langchain_community.document_loaders.parsers.images import (
ImageBlobParser,
RapidOCRBlobParser,
)

if TYPE_CHECKING:
import pdfminer
Expand All @@ -53,16 +55,49 @@
"JBIG2Decode",
]


def extract_from_images_with_rapidocr(
    images: Sequence[Union[Iterable[np.ndarray], bytes]],
) -> str:
    """Extract text from images with RapidOCR.

    Each image is passed to a single shared ``RapidOCR`` instance. For every
    image that yields a non-empty result, the text element (index 1) of each
    result entry is joined with newlines. The per-image strings are then
    concatenated in input order with no separator between images.

    Args:
        images: Images to extract text from.

    Returns:
        Text extracted from images; an empty string when ``images`` is empty
        or no image produced a result.

    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    """
    try:
        # Imported lazily so the dependency is only required when OCR is used.
        from rapidocr_onnxruntime import RapidOCR
    except ImportError as exc:
        # Chain the original failure so the root cause stays in the traceback.
        raise ImportError(
            "`rapidocr-onnxruntime` package not found, please install it with "
            "`pip install rapidocr-onnxruntime`"
        ) from exc

    ocr = RapidOCR()
    per_image_text: list[str] = []
    for img in images:
        result, _ = ocr(img)
        if result:
            # Only index 1 of each entry is used as the recognized text.
            per_image_text.append("\n".join(entry[1] for entry in result))
    # No separator between images, matching the historical output format.
    return "".join(per_image_text)


logger = logging.getLogger(__name__)

_FORMAT_IMAGE_STR = "\n\n{image_text}\n\n"
_JOIN_IMAGES = "\n"
_JOIN_TABLES = "\n"
_DEFAULT_PAGE_DELIMITOR = "\n\f"

_STD_METADATA_KEYS={"source", "total_pages", "creationdate", "creator", "producer"}
_STD_METADATA_KEYS = {"source", "total_pages", "creationdate", "creator", "producer"}


def _validate_metadata(metadata: dict[str, Any]) -> dict[str,Any]:
def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
"""Validates the presence of at least the following keys:
- source
- page (if mode='page')
Expand All @@ -73,7 +108,7 @@ def _validate_metadata(metadata: dict[str, Any]) -> dict[str,Any]:
"""
if not _STD_METADATA_KEYS.issubset(metadata.keys()):
raise ValueError("The PDF parser must valorize the standard metadata.")
if not isinstance(metadata.get("page",0), int):
if not isinstance(metadata.get("page", 0), int):
raise ValueError("The PDF metadata page must be a integer.")
return metadata

Expand Down Expand Up @@ -116,7 +151,10 @@ def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
return new_metadata


_PARAGRAPH_DELIMITOR = ["\n\n\n", "\n\n"] # To insert images or table in the middle of the page.
_PARAGRAPH_DELIMITOR = [
"\n\n\n",
"\n\n",
] # To insert images or table in the middle of the page.


def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
Expand All @@ -132,7 +170,7 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
"""

def _recurs_merge_text_and_extras(
extras: list[str], text_from_page: str, recurs: bool
extras: list[str], text_from_page: str, recurs: bool
) -> Optional[str]:
if extras:
for delim in _PARAGRAPH_DELIMITOR:
Expand All @@ -151,8 +189,9 @@ def _recurs_merge_text_and_extras(
str_extras = "\n\n".join(filter(lambda x: x, extras))
if str_extras:
all_extras = delim + str_extras
all_text = text_from_page[:pos] + all_extras + text_from_page[
pos:]
all_text = (
text_from_page[:pos] + all_extras + text_from_page[pos:]
)
break
else:
all_text = None
Expand All @@ -171,7 +210,6 @@ def _recurs_merge_text_and_extras(
return all_text



class ImagesPdfParser(BaseBlobParser):
"""Abstract interface for blob parsers with images_to_text."""

Expand Down Expand Up @@ -218,8 +256,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
)

def _extract_text_from_page(page: pypdf.PageObject) -> str:
"""Extract text from image given the version of pypdf.
"""
"""Extract text from image given the version of pypdf."""
if pypdf.__version__.startswith("3"):
return page.extract_text()
else:
Expand Down Expand Up @@ -561,11 +598,11 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
for page in doc:
all_text = self._get_page_content(doc, page, blob).strip()
if self.mode == "page":

yield Document(
page_content=all_text,
metadata=_validate_metadata(doc_metadata |
{"page": page.number}),
metadata=_validate_metadata(
doc_metadata | {"page": page.number}
),
)
else:
full_content.append(all_text)
Expand Down Expand Up @@ -658,17 +695,16 @@ def _extract_images_from_page(
if self.images_parser:
xref = img[0]
pix = pymupdf.Pixmap(doc, xref)
image=np.frombuffer(pix.samples, dtype=np.uint8).reshape(
pix.height, pix.width, -1
)
image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
pix.height, pix.width, -1
)
image_bytes = io.BytesIO()
Image.fromarray(image).save(image_bytes, format="PNG")
blob=Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
images.append(next(self.images_parser.lazy_parse(blob)).page_content)
return _FORMAT_IMAGE_STR.format(
image_text=_JOIN_IMAGES.join(filter(None,images))
)

image_text=_JOIN_IMAGES.join(filter(None, images))
)

def _extract_tables_from_page(self, page: pymupdf.Page) -> str:
"""Extract tables from a PDF page.
Expand Down
18 changes: 10 additions & 8 deletions libs/community/langchain_community/document_loaders/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,6 @@
from abc import ABC
from io import StringIO
from pathlib import Path, PurePath
from urllib.parse import urlparse

import requests
from typing import (
TYPE_CHECKING,
Any,
Expand All @@ -22,25 +19,30 @@
Union,
cast,
)
from urllib.parse import urlparse

import requests
from langchain_core.documents import Document
from langchain_core.utils import get_from_dict_or_env

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.dedoc import DedocBaseLoader
from langchain_community.document_loaders.parsers.images import ImageBlobParser, \
RapidOCRBlobParser
from langchain_community.document_loaders.parsers.images import (
ImageBlobParser,
RapidOCRBlobParser,
)
from langchain_community.document_loaders.parsers.pdf import (
_DEFAULT_PAGE_DELIMITOR,
AmazonTextractPDFParser,
DocumentIntelligenceParser,
PDFMinerParser,
PDFPlumberParser,
PyMuPDFParser,
PyPDFium2Parser,
PyPDFParser,
_DEFAULT_PAGE_DELIMITOR,
)
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from langchain_core.documents import Document
from langchain_core.utils import get_from_dict_or_env

if TYPE_CHECKING:
from textractor.data.text_linearization_config import TextLinearizationConfig
Expand Down
Loading

0 comments on commit 3fe4ec5

Please sign in to comment.