Skip to content

Commit

Permalink
Merge pull request #43 from arena-ai/42-add-the-as_png-op
Browse files Browse the repository at this point in the history
add as_png op
  • Loading branch information
ngrislain authored Nov 19, 2024
2 parents 049051b + 46fe64b commit 5179fe2
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 2 deletions.
13 changes: 12 additions & 1 deletion backend/app/api/routes/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@

from app.api.deps import CurrentUser
from app.services.object_store import documents
from app.ops.documents import path, paths, as_text
from app.ops.documents import path, paths, as_text, as_png

from app.models import Message


Expand Down Expand Up @@ -92,3 +93,13 @@ async def read_file_as_text(
end_page: int | None = None,
) -> str:
return await as_text(current_user, name, start_page, end_page).evaluate()

@router.get("/{name}/as_png")
async def read_file_as_png(
    *,
    current_user: CurrentUser,
    name: str,
    start_page: int = 0,
    end_page: int | None = None,
) -> None:
    """Trigger the ``as_png`` op for one of the current user's documents.

    Args:
        current_user: The authenticated user owning the document.
        name: Document name taken from the URL path.
        start_page: First page index handed to the op (defaults to 0).
        end_page: Page bound handed to the op; ``None`` is passed through
            unchanged.

    Returns:
        Whatever evaluating the op yields. The route is annotated ``None``,
        so no payload is expected; presumably the op only stores its
        results -- confirm against ``app.ops.documents``.
    """
    # Build the op invocation first, then evaluate it asynchronously.
    png_op = as_png(current_user, name, start_page, end_page)
    outcome = await png_op.evaluate()
    return outcome
36 changes: 36 additions & 0 deletions backend/app/ops/documents.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from io import BytesIO
import pandas as pd

from app.models import User
from app.ops import Op
Expand Down Expand Up @@ -54,6 +55,7 @@ async def call(
path_as_text = f"{source_path}as_text_from_page_{start_page}"
else:
path_as_text = f"{source_path}as_text"

if not documents.exists(path_as_text):
# The doc should be created
if content_type == "application/pdf":
Expand All @@ -63,6 +65,12 @@ async def call(
input, start_page=start_page, end_page=end_page
),
)
elif content_type == "application/vnd.ms-excel":
df = pd.read_excel(input)
documents.puts(
path_as_text,
df.to_csv(index=False)
)
else:
documents.puts(path_as_text, "Error: Could not read as text")
# output the file
Expand All @@ -71,3 +79,31 @@ async def call(


as_text = AsText()


class AsPng(Op[tuple[User, str], str]):
    """Op that renders a stored PDF to one PNG object per page.

    For a document whose stored content type is ``application/pdf``, every
    rendered page is written to ``{source_path}as_png_page_{page}`` in the
    ``documents`` store. For any other content type an error marker string
    is written instead.

    NOTE(review): the ``Op`` type parameters declare a ``str`` output while
    ``call`` returns ``None``; left untouched here because the ``Op`` base
    class is not visible -- confirm which one is intended.
    """

    async def call(
        self,
        user: User,
        name: str,
        start_page: int = 0,
        end_page: int | None = None,
    ) -> None:
        """Render the requested page range of ``name`` and store the PNGs.

        Args:
            user: Owner of the document; used to resolve the storage prefix.
            name: Name of the document to render.
            start_page: First page index forwarded to the PDF renderer.
            end_page: Page bound forwarded to the PDF renderer; ``None`` is
                forwarded unchanged.

        Returns:
            None. The results are written to the ``documents`` store.
        """
        source_path = await path.call(user, name)
        content_type = documents.gets(f"{source_path}content_type")

        # Decide on the content type *before* touching the PDF renderer, so
        # that non-PDF bytes are never handed to it. A single marker is
        # written at the first requested page because no page count exists
        # for a document that cannot be rendered.
        if content_type != "application/pdf":
            documents.puts(
                f"{source_path}as_png_page_{start_page}",
                "Error: Could not read as png",
            )
            return

        pdf_stream = BytesIO(documents.get(f"{source_path}data").read())
        rendered_pages = pdf_reader.as_png(
            pdf_stream, start_page=start_page, end_page=end_page
        )

        for page, byte_stream in rendered_pages:
            # Rewind defensively: the renderer fills each buffer by writing
            # into it, which may leave the cursor at the end of the stream.
            # Assumes documents.put reads from the current position -- TODO
            # confirm against app.services.object_store.
            byte_stream.seek(0)
            documents.put(f"{source_path}as_png_page_{page}", byte_stream)


# Default, shared instance of the op (imported by the API routes).
as_png = AsPng()
28 changes: 27 additions & 1 deletion backend/app/services/pdf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
import pymupdf
import pytesseract
from pdf2image import convert_from_bytes

from PIL import Image
from io import BytesIO

@dataclass
class PDFReader:
Expand Down Expand Up @@ -44,6 +45,31 @@ def perform_ocr_on_page(self, pdf_data: BinaryIO) -> str:
text += ocr_text

return text

def as_png(
    self,
    pdf_data: BinaryIO,
    start_page: int = 0,
    end_page: int | None = None,
) -> list[tuple[int, BinaryIO]]:
    """Render a range of PDF pages to in-memory PNG images.

    Args:
        pdf_data: Binary stream holding the PDF document.
        start_page: Zero-based index of the first page to render. Negative
            values are treated as 0.
        end_page: Exclusive upper bound of the page range. ``None`` means
            "through the last page"; ``0`` selects no pages.

    Returns:
        A list of ``(page_number, buffer)`` pairs in page order. Each buffer
        contains the PNG bytes and is rewound to position 0 so it can be
        read directly.
    """
    page_buffer_pairs: list[tuple[int, BinaryIO]] = []

    # The context manager closes the document once rendering is done.
    with pymupdf.Document(stream=pdf_data) as doc:
        page_count = doc.page_count
        first_page = max(start_page, 0)
        # Compare against None explicitly: `not end_page` would treat an
        # end_page of 0 as "no limit" and render every page.
        if end_page is None:
            stop_page = page_count
        else:
            stop_page = min(end_page, page_count)

        # Iterate over indices only; each page is loaded once, on demand.
        for page_num in range(first_page, stop_page):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(dpi=96, colorspace="csRGB", alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            buffer = BytesIO()
            # NOTE(review): Pillow's PNG writer documents that
            # optimize=True overrides compress_level; the arguments are
            # kept exactly as they were -- confirm which setting is wanted.
            img.save(buffer, format="PNG", optimize=True, compress_level=0)
            # Saving leaves the cursor at the end of the stream; rewind so
            # consumers calling .read() receive the image bytes.
            buffer.seek(0)
            page_buffer_pairs.append((page_num, buffer))

    return page_buffer_pairs


# A default instance
Expand Down
1 change: 1 addition & 0 deletions backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ minio = "^7.2.8"
pymupdf = "^1.24.10"
pytesseract = "^0.3.13"
pdf2image = "^1.17.0"
pandas = "^2.2.3"

[tool.poetry.group.dev.dependencies]
pytest = "^8.3"
Expand Down

0 comments on commit 5179fe2

Please sign in to comment.