From 7dc87dcbcd167da57ea18cfcbd80c5527d68c3f2 Mon Sep 17 00:00:00 2001 From: chuang8511 Date: Thu, 26 Sep 2024 10:59:44 +0100 Subject: [PATCH] feat(document): add repair function Retry pdfplumber.open() with repair=True when the initial open raises, and record the original error message in self.errors. --- operator/document/v0/python/transform_pdf_to_markdown.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/operator/document/v0/python/transform_pdf_to_markdown.py b/operator/document/v0/python/transform_pdf_to_markdown.py index 263380f9..ab69ce9c 100644 --- a/operator/document/v0/python/transform_pdf_to_markdown.py +++ b/operator/document/v0/python/transform_pdf_to_markdown.py @@ -24,7 +24,11 @@ class PdfTransformer: base64_images: list[dict] def __init__(self, x: BytesIO, display_image_tag: bool = False, image_index: int = 0): - self.pdf = pdfplumber.open(x) + try: + self.pdf = pdfplumber.open(x) + except Exception as e: + self.errors = [str(e)] + self.pdf = pdfplumber.open(x, repair=True) self.raw_pages = self.pdf.pages self.metadata = self.pdf.metadata self.display_image_tag = display_image_tag