fix(document): fix the bug from pdfplumber (#342)

Because - pdfplumber sometimes reads images at the wrong position. This commit - catches the error raised when the position is wrong and reports it in the output.
instill-ai · Sep 17, 2024 · 72f4931 · 72f4931
1 parent d893ccd
commit 72f4931
Show file tree

Hide file tree

Showing 5 changed files with 28 additions and 7 deletions.
diff --git a/operator/document/v0/README.mdx b/operator/document/v0/README.mdx
@@ -57,6 +57,7 @@ Convert document to text in Markdown format.
 | Body | `body` | string | Markdown text converted from the PDF document |
 | Filename (optional) | `filename` | string | The name of the file |
 | Images (optional) | `images` | array[string] | Images extracted from the document |
+| Error (optional) | `error` | string | Error messages, if any, raised during the conversion process |
 
 
 

diff --git a/operator/document/v0/config/tasks.json b/operator/document/v0/config/tasks.json
@@ -80,6 +80,13 @@
           },
           "title": "Images",
           "type": "array"
+        },
+        "error": {
+          "description": "Error message if any during the conversion process",
+          "instillFormat": "string",
+          "instillUIOrder": 3,
+          "title": "Error",
+          "type": "string"
         }
       },
       "required": [

diff --git a/operator/document/v0/convert_document_to_markdown.go b/operator/document/v0/convert_document_to_markdown.go
@@ -19,6 +19,7 @@ type ConvertDocumentToMarkdownOutput struct {
 	Body     string   `json:"body"`
 	Filename string   `json:"filename"`
 	Images   []string `json:"images,omitempty"`
+	Error    string   `json:"error,omitempty"`
 }
 
 func ConvertDocumentToMarkdown(inputStruct *ConvertDocumentToMarkdownInput, transformerGetter MarkdownTransformerGetterFunc) (*ConvertDocumentToMarkdownOutput, error) {
@@ -47,6 +48,7 @@ func ConvertDocumentToMarkdown(inputStruct *ConvertDocumentToMarkdownInput, tran
 	outputStruct := &ConvertDocumentToMarkdownOutput{
 		Body:   converterOutput.Body,
 		Images: converterOutput.Images,
+		Error:  strings.Join(converterOutput.Error, "\n"),
 	}
 
 	if inputStruct.Filename != "" {

diff --git a/operator/document/v0/pdf_to_markdown_converter.go b/operator/document/v0/pdf_to_markdown_converter.go
@@ -11,7 +11,7 @@ import (
 type converterOutput struct {
 	Body   string   `json:"body"`
 	Images []string `json:"images"`
-	Error  string   `json:"error"`
+	Error  []string `json:"error"`
 }
 
 func convertPDFToMarkdownWithPDFPlumber(base64Text string, displayImageTag bool) (converterOutput, error) {
@@ -54,8 +54,8 @@ func convertPDFToMarkdownWithPDFPlumber(base64Text string, displayImageTag bool)
 	}
 
 	err = json.Unmarshal(outputBytes, &output)
-	if err != nil || output.Error != "" {
-		return output, fmt.Errorf("failed to unmarshal output: %w, %s", err, output.Error)
+	if err != nil {
+		return output, fmt.Errorf("failed to unmarshal output: %w", err)
 	}
 
 	return output, nil

diff --git a/operator/document/v0/python/transformPDFToMarkdown.py b/operator/document/v0/python/transformPDFToMarkdown.py
@@ -14,6 +14,7 @@ def __init__(self, x, display_image_tag=False, image_index=0):
 		self.metadata = self.pdf.metadata
 		self.display_image_tag = display_image_tag
 		self.image_index = image_index
+		self.errors = []
 
 	def preprocess(self):
 		self.set_heights()
@@ -43,14 +44,20 @@ def process_image(self, i):
 				image["page_number"] = page.page_number
 				image["img_number"] = i
 				i += 1
-				img_base64 = self.encode_image(image, page)
+				img_base64 = self.encode_image(image, page, i)
 				image["img_base64"] = img_base64
 				self.images.append(image)
 		self.image_index = i
 
-	def encode_image(self, image, page):
+	def encode_image(self, image, page, i):
 		bbox = [image['x0'], page.cropbox[3]-image['y1'],  image['x1'], page.cropbox[3]-image['y0']]
-		img_page = page.crop(bbox=bbox)
+		# There is a bug in pdfplumber that it can't target the image position correctly.
+		try:
+			img_page = page.crop(bbox=bbox)
+		except Exception as e:
+			self.errors.append(f"image {i} got error: {str(e)}, so it convert all pages into image.")
+			img_page = page
+
 		img_obj = img_page.to_image(resolution=500)
 		buffer = BytesIO()
 		img_obj.save(buffer, format="PNG")
@@ -441,6 +448,7 @@ def insert_image(self, line, next_line):
 	images = []
 	separator_number = 30
 	image_idx = 0
+	errors = []
 
 	try:
 		times = len(pdf.raw_pages) // separator_number + 1
@@ -456,11 +464,14 @@ def insert_image(self, line, next_line):
 			result += pdf.execute()
 			for image in pdf.base64_images:
 				images.append(image)
+
+			errors += pdf.errors
 
 		output = {
 			"body": result,
 			"images": images,
+			"error": errors
 		}
 		print(json.dumps(output))
 	except Exception as e:
-		print(json.dumps({"error": str(e)}))
+		print(json.dumps({"error": [str(e)]}))