From 656d9024bad9aaabc35d9f0a9812d2698689ced6 Mon Sep 17 00:00:00 2001 From: chuang8511 Date: Thu, 26 Sep 2024 20:07:04 +0100 Subject: [PATCH 1/3] feat(document): repair pdf with libreoffice --- operator/document/v0/convert_to_images.go | 12 +++++- operator/document/v0/execution/pdf_checker.py | 21 +++++++++ operator/document/v0/helper.go | 43 +++++++++++++++++++ operator/document/v0/main.go | 3 ++ .../document/v0/pdf_to_markdown_converter.go | 13 +++++- 5 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 operator/document/v0/execution/pdf_checker.py create mode 100644 operator/document/v0/helper.go diff --git a/operator/document/v0/convert_to_images.go b/operator/document/v0/convert_to_images.go index ebc9e0eb..1b1a5fe0 100644 --- a/operator/document/v0/convert_to_images.go +++ b/operator/document/v0/convert_to_images.go @@ -45,8 +45,18 @@ func ConvertDocumentToImage(inputStruct *ConvertDocumentToImagesInput) (*Convert base64PDF = strings.Split(inputStruct.Document, ",")[1] } + var base64PDFWithoutMime string + if RequiredToRepair(base64PDF) { + base64PDFWithoutMime, err = RepairPDF(base64PDF) + if err != nil { + return nil, fmt.Errorf("failed to repair PDF: %w", err) + } + } else { + base64PDFWithoutMime = base.TrimBase64Mime(base64PDF) + } + paramsJSON := map[string]interface{}{ - "PDF": base.TrimBase64Mime(base64PDF), + "PDF": base64PDFWithoutMime, "filename": inputStruct.Filename, } diff --git a/operator/document/v0/execution/pdf_checker.py b/operator/document/v0/execution/pdf_checker.py new file mode 100644 index 00000000..a2be8157 --- /dev/null +++ b/operator/document/v0/execution/pdf_checker.py @@ -0,0 +1,21 @@ +from io import BytesIO +import json +import base64 +import sys + +# TODO: Deal with the import error when running the code in the docker container +# from pdf_to_markdown import PDFTransformer + +if __name__ == "__main__": + json_str = sys.stdin.buffer.read().decode('utf-8') + params = json.loads(json_str) + pdf_string = 
params["PDF"] + + decoded_bytes = base64.b64decode(pdf_string) + pdf_file_obj = BytesIO(decoded_bytes) + pdf = PDFTransformer(x=pdf_file_obj) + pages = pdf.raw_pages + output = { + "required": len(pages) == 0, + } + print(json.dumps(output)) diff --git a/operator/document/v0/helper.go b/operator/document/v0/helper.go new file mode 100644 index 00000000..9d98274a --- /dev/null +++ b/operator/document/v0/helper.go @@ -0,0 +1,43 @@ +package document + +import ( + "encoding/json" + "log" + + "github.com/instill-ai/component/base" + "github.com/instill-ai/component/internal/util" +) + +func RequiredToRepair(pdfBase64 string) bool { + + paramsJSON := map[string]interface{}{ + "PDF": base.TrimBase64Mime(pdfBase64), + } + + pythonCode := pdfTransformer + pdfChecker + + outputBytes, err := util.ExecutePythonCode(pythonCode, paramsJSON) + + if err != nil { + // It shouldn't block the original process. + log.Printf("failed to run python script: %v", err) + return false + } + + var output struct { + Repair bool `json:"required"` + } + + err = json.Unmarshal(outputBytes, &output) + + if err != nil { + // It shouldn't block the original process. 
+ log.Printf("failed to unmarshal output: %v", err) + } + + return output.Repair +} + +func RepairPDF(pdfBase64 string) (string, error) { + return ConvertToPDF(pdfBase64, "pdf") +} diff --git a/operator/document/v0/main.go b/operator/document/v0/main.go index d409bf5c..96e6156b 100644 --- a/operator/document/v0/main.go +++ b/operator/document/v0/main.go @@ -36,6 +36,9 @@ var ( //go:embed execution/task_convert_to_images.py taskConvertToImagesExecution string + //go:embed execution/pdf_checker.py + pdfChecker string + once sync.Once comp *component ) diff --git a/operator/document/v0/pdf_to_markdown_converter.go b/operator/document/v0/pdf_to_markdown_converter.go index 68117ac2..308144f6 100644 --- a/operator/document/v0/pdf_to_markdown_converter.go +++ b/operator/document/v0/pdf_to_markdown_converter.go @@ -19,8 +19,19 @@ type converterOutput struct { func convertPDFToMarkdownWithPDFPlumber(base64Text string, displayImageTag bool, displayAllPage bool) (converterOutput, error) { + var pdfBase64 string + var err error + if RequiredToRepair(base64Text) { + pdfBase64, err = RepairPDF(base64Text) + if err != nil { + return converterOutput{}, fmt.Errorf("failed to repair PDF: %w", err) + } + } else { + pdfBase64 = base.TrimBase64Mime(base64Text) + } + paramsJSON, err := json.Marshal(map[string]interface{}{ - "PDF": base.TrimBase64Mime(base64Text), + "PDF": pdfBase64, "display-image-tag": displayImageTag, "display-all-page-image": displayAllPage, }) From a965e458158ba9aad14422683f524dc124d47b8f Mon Sep 17 00:00:00 2001 From: chuang8511 Date: Fri, 27 Sep 2024 12:32:49 +0100 Subject: [PATCH 2/3] chore: clarify comments --- operator/document/v0/execution/pdf_checker.py | 4 +++- operator/document/v0/execution/task_convert_to_images.py | 4 +++- operator/document/v0/execution/task_convert_to_markdown.py | 4 +++- operator/document/v0/pdf_to_markdown/pdf_transformer.py | 4 +++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git 
a/operator/document/v0/execution/pdf_checker.py b/operator/document/v0/execution/pdf_checker.py index a2be8157..de596d35 100644 --- a/operator/document/v0/execution/pdf_checker.py +++ b/operator/document/v0/execution/pdf_checker.py @@ -3,7 +3,9 @@ import base64 import sys -# TODO: Deal with the import error when running the code in the docker container +# TODO chuang8511: +# Deal with the import error when running the code in the docker container. +# Now, we combine all python code into one file to avoid the import error. # from pdf_to_markdown import PDFTransformer if __name__ == "__main__": diff --git a/operator/document/v0/execution/task_convert_to_images.py b/operator/document/v0/execution/task_convert_to_images.py index aadd0457..6ab77416 100644 --- a/operator/document/v0/execution/task_convert_to_images.py +++ b/operator/document/v0/execution/task_convert_to_images.py @@ -3,7 +3,9 @@ import base64 import sys -# TODO: Deal with the import error when running the code in the docker container +# TODO chuang8511: +# Deal with the import error when running the code in the docker container. +# Now, we combine all python code into one file to avoid the import error. # from pdf_to_markdown import PDFTransformer # from pdf_to_markdown import PageImageProcessor diff --git a/operator/document/v0/execution/task_convert_to_markdown.py b/operator/document/v0/execution/task_convert_to_markdown.py index fb3d17a2..a7e6b70e 100644 --- a/operator/document/v0/execution/task_convert_to_markdown.py +++ b/operator/document/v0/execution/task_convert_to_markdown.py @@ -3,7 +3,9 @@ import base64 import sys -# TODO: Deal with the import error when running the code in the docker container +# TODO chuang8511: +# Deal with the import error when running the code in the docker container. +# Now, we combine all python code into one file to avoid the import error. 
# from pdf_to_markdown import PDFTransformer diff --git a/operator/document/v0/pdf_to_markdown/pdf_transformer.py b/operator/document/v0/pdf_to_markdown/pdf_transformer.py index 58a5e940..cf5cead2 100644 --- a/operator/document/v0/pdf_to_markdown/pdf_transformer.py +++ b/operator/document/v0/pdf_to_markdown/pdf_transformer.py @@ -5,7 +5,9 @@ import pdfplumber from pdfplumber.page import Page -# TODO: Deal with the import error when running the code in the docker container +# TODO chuang8511: +# Deal with the import error when running the code in the docker container. +# Now, we combine all python code into one file to avoid the import error. # from page_image_processor import PageImageProcessor From f2b9fb3814ad0986a681a2bc4da04053aaa0fde9 Mon Sep 17 00:00:00 2001 From: chuang8511 Date: Fri, 27 Sep 2024 19:54:07 +0100 Subject: [PATCH 3/3] fix: fix the bug from deploying container --- operator/document/v0/markdown_transformer.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/operator/document/v0/markdown_transformer.go b/operator/document/v0/markdown_transformer.go index b36c2abd..c43ba9f1 100644 --- a/operator/document/v0/markdown_transformer.go +++ b/operator/document/v0/markdown_transformer.go @@ -275,7 +275,15 @@ func ConvertToPDF(base64Encoded, fileExtension string) (string, error) { base64PDF, err := encodeFileToBase64(tempPDFName) if err != nil { - return "", fmt.Errorf("failed to encode file to base64: %w", err) + // Different containers ship different versions of LibreOffice, so its behavior may differ. + // We therefore need to handle the case where the generated PDF is not in the temp directory. + if fileExtension == "pdf" { + base64PDF, err := encodeFileToBase64(inputFileName) + if err != nil { + return "", fmt.Errorf("failed to encode file to base64: %w", err) + } + return base64PDF, nil + } } return base64PDF, nil }