Skip to content

Commit

Permalink
setup?
Browse files Browse the repository at this point in the history
  • Loading branch information
tomkosm committed Dec 27, 2024
1 parent 1526e3c commit 907709d
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 22 deletions.
69 changes: 47 additions & 22 deletions app/serverless.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
from pathlib import Path
from uuid import uuid4
import shutil

import magic_pdf.model as model_config
import runpod
Expand All @@ -15,28 +16,15 @@
model_config.__model_mode__ = "full"

_tmp_dir = "/tmp/{uuid}"
_local_image_dir = "/tmp/{uuid}/images"

def handler(event):
try:
# Extract base64 encoded file and filename from the event
input_data = event.get("input", {})
base64_content = input_data.get("file_content")
filename = input_data.get("filename")

if not base64_content or not filename:
return {"error": "Missing file_content or filename"}

# Decode base64 content
pdf_bytes = base64.b64decode(base64_content)

# Set up temporary directories
uuid_str = str(uuid4())
tmp_dir = _tmp_dir.format(uuid=uuid_str)
local_image_dir = _local_image_dir.format(uuid=uuid_str)
os.makedirs(tmp_dir, exist_ok=True)
os.makedirs(local_image_dir, exist_ok=True)
def convert_to_markdown(pdf_bytes, tmp_dir, filename):
"""Convert file to markdown and handle office document conversion if needed"""
# Set up temporary directories
local_image_dir = f"{tmp_dir}/images"
os.makedirs(tmp_dir, exist_ok=True)
os.makedirs(local_image_dir, exist_ok=True)

try:
# Handle office documents conversion
if filename.endswith(OfficeExts.__args__):
input_file: Path = Path(tmp_dir) / filename
Expand All @@ -46,7 +34,7 @@ def handler(event):
office_converter.convert(input_file, output_file)
pdf_bytes = output_file.read_bytes()
elif not filename.endswith(".pdf"):
return {"error": "Unsupported file type"}
raise ValueError("Unsupported file type")

# Process PDF
image_writer = DiskReaderWriter(local_image_dir)
Expand All @@ -55,11 +43,48 @@ def handler(event):
pipe.pipe_classify()
pipe.pipe_analyze()
pipe.pipe_parse()
md_content = pipe.pipe_mk_markdown(local_image_dir, drop_mode="none")
return pipe.pipe_mk_markdown(local_image_dir, drop_mode="none")
finally:
# Clean up temporary directory
shutil.rmtree(tmp_dir, ignore_errors=True)

def setup():
    """Warm up the conversion pipeline with a bundled sample PDF.

    Reads ``../pdfs/small_ocr2.pdf`` (resolved against the current working
    directory) and runs it through ``convert_to_markdown`` once. The
    resulting markdown is discarded.

    The conversion gets its own unique scratch directory instead of the
    sample directory. ``convert_to_markdown`` creates an ``images``
    sub-directory inside the directory it is given and removes that whole
    directory when it finishes, so passing the sample directory would delete
    the sample PDF after the first warm-up.

    Raises:
        OSError: If the sample PDF cannot be opened or read.
        Exception: Anything raised by ``convert_to_markdown`` propagates.
    """
    sample_dir = "../pdfs"
    filename = "small_ocr2.pdf"
    sample_pdf_path = os.path.join(sample_dir, filename)

    with open(sample_pdf_path, "rb") as f:
        sample_pdf_bytes = f.read()

    # Same per-call scratch directory scheme that handler() uses, so the
    # clean-up inside convert_to_markdown never touches the sample files.
    warmup_dir = _tmp_dir.format(uuid=str(uuid4()))

    # Warm up the conversion process
    convert_to_markdown(sample_pdf_bytes, warmup_dir, filename)

def handler(event):
    """Convert a base64-encoded document from a job event into markdown.

    Args:
        event: Job payload. ``event["input"]`` is expected to be a mapping
            with ``file_content`` (base64-encoded file bytes) and
            ``filename`` (used to pick the conversion path by extension).

    Returns:
        ``{"markdown": <content>}`` on success, otherwise
        ``{"error": <message>}``. Exceptions are never propagated; any
        failure is reported through the ``error`` key.
    """
    try:
        # `or {}` also covers an explicit `"input": None`, which would
        # otherwise surface as an opaque AttributeError message.
        input_data = event.get("input") or {}
        base64_content = input_data.get("file_content")
        filename = input_data.get("filename")

        if not base64_content or not filename:
            return {"error": "Missing file_content or filename"}

        # The filename is caller-supplied and later joined onto the
        # temporary directory; keep only the final path component so that
        # values such as "../../x.docx" cannot escape that directory.
        filename = Path(filename).name
        if not filename:
            return {"error": "Missing file_content or filename"}

        # Decode base64 content
        pdf_bytes = base64.b64decode(base64_content)

        # Create unique temporary directory
        uuid_str = str(uuid4())
        tmp_dir = _tmp_dir.format(uuid=uuid_str)

        # Convert file to markdown
        md_content = convert_to_markdown(pdf_bytes, tmp_dir, filename)
        return {"markdown": md_content}

    except Exception as e:
        # Top-level boundary: report the failure to the caller as data.
        return {"error": str(e)}

# Run the warm-up before accepting jobs: setup() performs one conversion of
# the sample PDF, presumably so that one-time initialisation happens at
# start-up rather than during the first real request.
setup()

# Start the RunPod serverless worker with `handler` as the job handler.
runpod.serverless.start({"handler": handler})
Binary file added pdfs/small_ocr2.pdf
Binary file not shown.

0 comments on commit 907709d

Please sign in to comment.