diff --git a/app/serverless.py b/app/serverless.py
index ddeaa48..9f28d7f 100644
--- a/app/serverless.py
+++ b/app/serverless.py
@@ -2,6 +2,7 @@
 import os
 from pathlib import Path
 from uuid import uuid4
+import shutil
 
 import magic_pdf.model as model_config
 import runpod
@@ -15,28 +16,15 @@
 model_config.__model_mode__ = "full"
 
 _tmp_dir = "/tmp/{uuid}"
-_local_image_dir = "/tmp/{uuid}/images"
 
 
-def handler(event):
-    try:
-        # Extract base64 encoded file and filename from the event
-        input_data = event.get("input", {})
-        base64_content = input_data.get("file_content")
-        filename = input_data.get("filename")
-
-        if not base64_content or not filename:
-            return {"error": "Missing file_content or filename"}
-
-        # Decode base64 content
-        pdf_bytes = base64.b64decode(base64_content)
-
-        # Set up temporary directories
-        uuid_str = str(uuid4())
-        tmp_dir = _tmp_dir.format(uuid=uuid_str)
-        local_image_dir = _local_image_dir.format(uuid=uuid_str)
-        os.makedirs(tmp_dir, exist_ok=True)
-        os.makedirs(local_image_dir, exist_ok=True)
+def convert_to_markdown(pdf_bytes, tmp_dir, filename):
+    """Convert file to markdown and handle office document conversion if needed"""
+    # Set up temporary directories
+    local_image_dir = f"{tmp_dir}/images"
+    os.makedirs(tmp_dir, exist_ok=True)
+    os.makedirs(local_image_dir, exist_ok=True)
+    try:
         # Handle office documents conversion
         if filename.endswith(OfficeExts.__args__):
             input_file: Path = Path(tmp_dir) / filename
@@ -46,7 +34,7 @@ def handler(event):
             office_converter.convert(input_file, output_file)
             pdf_bytes = output_file.read_bytes()
         elif not filename.endswith(".pdf"):
-            return {"error": "Unsupported file type"}
+            raise ValueError("Unsupported file type")
 
         # Process PDF
         image_writer = DiskReaderWriter(local_image_dir)
@@ -55,11 +43,51 @@ def handler(event):
         pipe.pipe_classify()
         pipe.pipe_analyze()
         pipe.pipe_parse()
-        md_content = pipe.pipe_mk_markdown(local_image_dir, drop_mode="none")
+        return pipe.pipe_mk_markdown(local_image_dir, drop_mode="none")
+    finally:
+        # Clean up temporary directory
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+
+def setup():
+    """Warm up the conversion pipeline by converting the bundled sample PDF."""
+    sample_dir = "../pdfs"
+    filename = "small_ocr2.pdf"
+    sample_pdf_path = os.path.join(sample_dir, filename)
+
+    with open(sample_pdf_path, "rb") as f:
+        sample_pdf_bytes = f.read()
+
+    # convert_to_markdown() removes its tmp_dir when it returns, so hand it a
+    # throwaway directory rather than the directory holding the sample PDF.
+    tmp_dir = _tmp_dir.format(uuid=str(uuid4()))
+    convert_to_markdown(sample_pdf_bytes, tmp_dir, filename)
 
+def handler(event):
+    """Decode the uploaded file from the event and return its markdown or an error."""
+    try:
+        # Extract base64 encoded file and filename from the event
+        input_data = event.get("input", {})
+        base64_content = input_data.get("file_content")
+        filename = input_data.get("filename")
+
+        if not base64_content or not filename:
+            return {"error": "Missing file_content or filename"}
+
+        # Decode base64 content
+        pdf_bytes = base64.b64decode(base64_content)
+
+        # Create unique temporary directory
+        uuid_str = str(uuid4())
+        tmp_dir = _tmp_dir.format(uuid=uuid_str)
+
+        # Convert file to markdown
+        md_content = convert_to_markdown(pdf_bytes, tmp_dir, filename)
         return {"markdown": md_content}
     except Exception as e:
         return {"error": str(e)}
 
 
+# Call setup to initiate and warm up resources
+setup()
+
 runpod.serverless.start({"handler": handler})
\ No newline at end of file
diff --git a/pdfs/small_ocr2.pdf b/pdfs/small_ocr2.pdf
new file mode 100644
index 0000000..ac1c943
Binary files /dev/null and b/pdfs/small_ocr2.pdf differ