Skip to content
This repository has been archived by the owner on Jan 9, 2025. It is now read-only.

Commit

Permalink
fix(document): fix the bug from pdfplumber (#342)
Browse files Browse the repository at this point in the history
Because

- pdfplumber reports some images at the wrong position

This commit

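- catches the error raised when an image's position is wrong, falls back to rendering the whole page for that image, and returns the collected error messages in the output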
  • Loading branch information
chuang8511 authored Sep 17, 2024
1 parent d893ccd commit 72f4931
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 7 deletions.
1 change: 1 addition & 0 deletions operator/document/v0/README.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ Convert document to text in Markdown format.
| Body | `body` | string | Markdown text converted from the PDF document |
| Filename (optional) | `filename` | string | The name of the file |
| Images (optional) | `images` | array[string] | Images extracted from the document |
| Error (optional) | `error` | string | Error message if any during the conversion process |



Expand Down
7 changes: 7 additions & 0 deletions operator/document/v0/config/tasks.json
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,13 @@
},
"title": "Images",
"type": "array"
},
"error": {
"description": "Error message if any during the conversion process",
"instillFormat": "string",
"instillUIOrder": 3,
"title": "Error",
"type": "string"
}
},
"required": [
Expand Down
2 changes: 2 additions & 0 deletions operator/document/v0/convert_document_to_markdown.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ type ConvertDocumentToMarkdownOutput struct {
Body string `json:"body"`
Filename string `json:"filename"`
Images []string `json:"images,omitempty"`
Error string `json:"error,omitempty"`
}

func ConvertDocumentToMarkdown(inputStruct *ConvertDocumentToMarkdownInput, transformerGetter MarkdownTransformerGetterFunc) (*ConvertDocumentToMarkdownOutput, error) {
Expand Down Expand Up @@ -47,6 +48,7 @@ func ConvertDocumentToMarkdown(inputStruct *ConvertDocumentToMarkdownInput, tran
outputStruct := &ConvertDocumentToMarkdownOutput{
Body: converterOutput.Body,
Images: converterOutput.Images,
Error: strings.Join(converterOutput.Error, "\n"),
}

if inputStruct.Filename != "" {
Expand Down
6 changes: 3 additions & 3 deletions operator/document/v0/pdf_to_markdown_converter.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (
type converterOutput struct {
Body string `json:"body"`
Images []string `json:"images"`
Error string `json:"error"`
Error []string `json:"error"`
}

func convertPDFToMarkdownWithPDFPlumber(base64Text string, displayImageTag bool) (converterOutput, error) {
Expand Down Expand Up @@ -54,8 +54,8 @@ func convertPDFToMarkdownWithPDFPlumber(base64Text string, displayImageTag bool)
}

err = json.Unmarshal(outputBytes, &output)
if err != nil || output.Error != "" {
return output, fmt.Errorf("failed to unmarshal output: %w, %s", err, output.Error)
if err != nil {
return output, fmt.Errorf("failed to unmarshal output: %w", err)
}

return output, nil
Expand Down
19 changes: 15 additions & 4 deletions operator/document/v0/python/transformPDFToMarkdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def __init__(self, x, display_image_tag=False, image_index=0):
self.metadata = self.pdf.metadata
self.display_image_tag = display_image_tag
self.image_index = image_index
self.errors = []

def preprocess(self):
self.set_heights()
Expand Down Expand Up @@ -43,14 +44,20 @@ def process_image(self, i):
image["page_number"] = page.page_number
image["img_number"] = i
i += 1
img_base64 = self.encode_image(image, page)
img_base64 = self.encode_image(image, page, i)
image["img_base64"] = img_base64
self.images.append(image)
self.image_index = i

def encode_image(self, image, page):
def encode_image(self, image, page, i):
bbox = [image['x0'], page.cropbox[3]-image['y1'], image['x1'], page.cropbox[3]-image['y0']]
img_page = page.crop(bbox=bbox)
# There is a bug in pdfplumber that it can't target the image position correctly.
try:
img_page = page.crop(bbox=bbox)
except Exception as e:
self.errors.append(f"image {i} got error: {str(e)}, so it convert all pages into image.")
img_page = page

img_obj = img_page.to_image(resolution=500)
buffer = BytesIO()
img_obj.save(buffer, format="PNG")
Expand Down Expand Up @@ -441,6 +448,7 @@ def insert_image(self, line, next_line):
images = []
separator_number = 30
image_idx = 0
errors = []

try:
times = len(pdf.raw_pages) // separator_number + 1
Expand All @@ -456,11 +464,14 @@ def insert_image(self, line, next_line):
result += pdf.execute()
for image in pdf.base64_images:
images.append(image)

errors += pdf.errors

output = {
"body": result,
"images": images,
"error": errors
}
print(json.dumps(output))
except Exception as e:
print(json.dumps({"error": str(e)}))
print(json.dumps({"error": [str(e)]}))

0 comments on commit 72f4931

Please sign in to comment.