Skip to content
This repository has been archived by the owner on Jan 9, 2025. It is now read-only.

feat(document): repair pdf with libreoffice #374

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion operator/document/v0/convert_to_images.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,18 @@ func ConvertDocumentToImage(inputStruct *ConvertDocumentToImagesInput) (*Convert
base64PDF = strings.Split(inputStruct.Document, ",")[1]
}

var base64PDFWithoutMime string
if RequiredToRepair(base64PDF) {
base64PDFWithoutMime, err = RepairPDF(base64PDF)
if err != nil {
return nil, fmt.Errorf("failed to repair PDF: %w", err)
}
} else {
base64PDFWithoutMime = base.TrimBase64Mime(base64PDF)
}

paramsJSON := map[string]interface{}{
"PDF": base.TrimBase64Mime(base64PDF),
"PDF": base64PDFWithoutMime,
"filename": inputStruct.Filename,
}

Expand Down
21 changes: 21 additions & 0 deletions operator/document/v0/execution/pdf_checker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from io import BytesIO
import json
import base64
import sys

# TODO: Deal with the import error when running the code in the docker container
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@chuang8511
I think this PR is not ready, right?

Copy link
Contributor Author

@chuang8511 chuang8511 Sep 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: Deal with the import error when running the code in the docker container

I will deal with this part in another PR.
However, this PR is not ready yet because there is a bug in the production container, so I may need to modify it. I have marked it as a draft for now.

# from pdf_to_markdown import PDFTransformer

if __name__ == "__main__":
    # The request arrives on stdin as a UTF-8 JSON object; the base64-encoded
    # PDF is stored under the "PDF" key.
    request = json.loads(sys.stdin.buffer.read().decode('utf-8'))
    raw_pdf = base64.b64decode(request["PDF"])

    # NOTE(review): PDFTransformer is not imported in this file — presumably it
    # is supplied by code prepended to this script at run time; confirm
    # against the caller.
    transformer = PDFTransformer(x=BytesIO(raw_pdf))

    # A document for which no raw pages were extracted is reported as
    # requiring repair.
    needs_repair = len(transformer.raw_pages) == 0
    print(json.dumps({"required": needs_repair}))
43 changes: 43 additions & 0 deletions operator/document/v0/helper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package document

import (
"encoding/json"
"log"

"github.com/instill-ai/component/base"
"github.com/instill-ai/component/internal/util"
)

// RequiredToRepair reports whether the given base64-encoded PDF should be
// repaired before further processing.
//
// The input is passed through base.TrimBase64Mime and handed, under the "PDF"
// key, to the Python source formed by concatenating pdfTransformer and
// pdfChecker. The script's JSON output is expected to carry a boolean
// "required" field, which becomes the return value.
//
// The check is best-effort and must not block the original process: if the
// script fails or its output cannot be decoded, the error is logged and false
// is returned.
func RequiredToRepair(pdfBase64 string) bool {
	paramsJSON := map[string]interface{}{
		"PDF": base.TrimBase64Mime(pdfBase64),
	}

	pythonCode := pdfTransformer + pdfChecker

	outputBytes, err := util.ExecutePythonCode(pythonCode, paramsJSON)
	if err != nil {
		// Printf with %v: Println does not interpret format verbs, and %w is
		// only meaningful to fmt.Errorf.
		log.Printf("failed to run python script: %v", err)
		return false
	}

	var output struct {
		Repair bool `json:"required"`
	}

	if err := json.Unmarshal(outputBytes, &output); err != nil {
		log.Printf("failed to unmarshal output: %v", err)
		// Do not trust a possibly half-populated struct after a decode error.
		return false
	}

	return output.Repair
}

// RepairPDF delegates to ConvertToPDF with the given base64 PDF and the
// literal "pdf" as the second argument, and returns that call's result and
// error unchanged.
func RepairPDF(pdfBase64 string) (string, error) {
	// NOTE(review): "pdf" presumably names the source file type/extension
	// expected by ConvertToPDF — confirm against its definition.
	const sourceType = "pdf"

	repaired, err := ConvertToPDF(pdfBase64, sourceType)
	return repaired, err
}
3 changes: 3 additions & 0 deletions operator/document/v0/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ var (
//go:embed execution/task_convert_to_images.py
taskConvertToImagesExecution string

//go:embed execution/pdf_checker.py
pdfChecker string

once sync.Once
comp *component
)
Expand Down
13 changes: 12 additions & 1 deletion operator/document/v0/pdf_to_markdown_converter.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,19 @@ type converterOutput struct {

func convertPDFToMarkdownWithPDFPlumber(base64Text string, displayImageTag bool, displayAllPage bool) (converterOutput, error) {

var pdfBase64 string
var err error
if RequiredToRepair(base64Text) {
pdfBase64, err = RepairPDF(base64Text)
if err != nil {
return converterOutput{}, fmt.Errorf("failed to repair PDF: %w", err)
}
} else {
pdfBase64 = base.TrimBase64Mime(base64Text)
}

paramsJSON, err := json.Marshal(map[string]interface{}{
"PDF": base.TrimBase64Mime(base64Text),
"PDF": pdfBase64,
"display-image-tag": displayImageTag,
"display-all-page-image": displayAllPage,
})
Expand Down
Loading