From 488a68edbf7429f1cbe8a84c11afde39832433d7 Mon Sep 17 00:00:00 2001 From: Holt Skinner Date: Tue, 7 Nov 2023 13:02:21 -0600 Subject: [PATCH] fix: Add Backwards Compatibility for incorrect hOCR Format --- ...hocr_document_template_inline_words.xml.j2 | 29 ++++++ .../documentai_toolbox/wrappers/document.py | 13 ++- .../convert_document_to_hocr_sample.py | 4 +- ...olbox_invoice_test_0_hocr_inline_words.xml | 89 +++++++++++++++++++ tests/unit/test_document.py | 31 ++++++- 5 files changed, 160 insertions(+), 6 deletions(-) create mode 100644 google/cloud/documentai_toolbox/templates/hocr_document_template_inline_words.xml.j2 create mode 100644 tests/unit/resources/toolbox_invoice_test_0_hocr_inline_words.xml diff --git a/google/cloud/documentai_toolbox/templates/hocr_document_template_inline_words.xml.j2 b/google/cloud/documentai_toolbox/templates/hocr_document_template_inline_words.xml.j2 new file mode 100644 index 00000000..57c7e976 --- /dev/null +++ b/google/cloud/documentai_toolbox/templates/hocr_document_template_inline_words.xml.j2 @@ -0,0 +1,29 @@ + + + + +{{ title }} + + + + + + + +{% for page in pages -%} + {% set page_number = page.documentai_object.page_number -%} +
{% for docai_block in page.blocks -%} + {% set bidx = loop.index0 -%} + {% for paragraph in docai_block.paragraphs -%} + {% set paridx = loop.index0 -%} + {% for line in paragraph.lines -%} + {% set lidx = loop.index0 -%} + {{ line.text }}{% for token in line.tokens -%} + {% set tidx = loop.index0 -%} + {{ token.text }}{% endfor -%}{% endfor -%} + {% endfor -%} + {% endfor -%} +
+{% endfor -%} + + diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py index 69f893f2..03491871 100644 --- a/google/cloud/documentai_toolbox/wrappers/document.py +++ b/google/cloud/documentai_toolbox/wrappers/document.py @@ -790,7 +790,7 @@ def export_images( return output_filenames - def export_hocr_str(self, title: str) -> str: + def export_hocr_str(self, title: str, inline_words: bool = True) -> str: r"""Exports a string hOCR version of the Document. The format for the id of the object follows as such: @@ -802,6 +802,10 @@ def export_hocr_str(self, title: str) -> str: Args: title (str): Required. The title for hocr_page and head. + inline_words (bool): + Optional. Include `ocrx_word` elements inline. + Default: True - For backwards compatibility only, set to `False` for all new implementations. + See https://github.com/googleapis/python-documentai-toolbox/issues/193 Returns: str: @@ -810,7 +814,12 @@ def export_hocr_str(self, title: str) -> str: environment = Environment( loader=PackageLoader("google.cloud.documentai_toolbox", "templates") ) - template = environment.get_template("hocr_document_template.xml.j2") + template_name = ( + "hocr_document_template_inline_words.xml.j2" + if inline_words + else "hocr_document_template.xml.j2" + ) + template = environment.get_template(template_name) content = template.render(pages=self.pages, title=title) return content diff --git a/samples/snippets/convert_document_to_hocr_sample.py b/samples/snippets/convert_document_to_hocr_sample.py index d89fe21f..b99301ee 100644 --- a/samples/snippets/convert_document_to_hocr_sample.py +++ b/samples/snippets/convert_document_to_hocr_sample.py @@ -28,7 +28,9 @@ def convert_document_to_hocr_sample(document_path: str, document_title: str) -> wrapped_document = document.Document.from_document_path(document_path=document_path) # Converting wrapped_document to hOCR format - hocr_string = 
wrapped_document.export_hocr_str(title=document_title) + hocr_string = wrapped_document.export_hocr_str( + title=document_title, inline_words=False + ) print("Document converted to hOCR!") return hocr_string diff --git a/tests/unit/resources/toolbox_invoice_test_0_hocr_inline_words.xml b/tests/unit/resources/toolbox_invoice_test_0_hocr_inline_words.xml new file mode 100644 index 00000000..6885961b --- /dev/null +++ b/tests/unit/resources/toolbox_invoice_test_0_hocr_inline_words.xml @@ -0,0 +1,89 @@ + + + + +toolbox_invoice_test-0 + + + + + + + +
Invoice +Invoice +DATE: 01/01/1970 +DATE: 01/01/1970 +INVOICE: NO. 001 +INVOICE: NO. 001 +FROM: Company ABC +FROM: Company ABC +user@companyabc.com +user@companyabc.com +TO: John Doe +TO: John Doe +johndoe@email.com +johndoe@email.com +ADDRESS: 111 Main Street +ADDRESS: 111 Main Street +Anytown, USA +Anytown, USA +ADDRESS: 222 Main Street +ADDRESS: 222 Main Street +Anytown, USA +Anytown, USA +TERMS: 6 month contract +TERMS: 6 month contract +DUE: 01/01/2025 +DUE: 01/01/2025 +Item Description +Item Description +Quantity +Quantity +Price +Price +Amount +Amount +Tool A +Tool A +500 +500 +$1.00 +$1.00 +$500.00 +$500.00 +Service B +Service B +1 +1 +$900.00 +$900.00 +$900.00 +$900.00 +Resource C +Resource C +50 +50 +$12.00 +$12.00 +$600.00 +$600.00 +Subtotal +Subtotal +$2000.00 +$2000.00 +Tax +Tax +$140.00 +$140.00 +BALANCE DUE +BALANCE DUE +$2140.00 +$2140.00 +NOTES: +NOTES: +Supplies used for Project Q. +Supplies used for Project Q. +
+ + \ No newline at end of file diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index d988e1e0..888326b0 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -688,9 +688,13 @@ def test_export_hocr_str(): document_path="tests/unit/resources/0/toolbox_invoice_test-0.json" ) - actual_hocr = wrapped_document.export_hocr_str(title="toolbox_invoice_test-0") + actual_hocr = wrapped_document.export_hocr_str( + title="toolbox_invoice_test-0", inline_words=False + ) - with open("tests/unit/resources/toolbox_invoice_test_0_hocr.xml", "r") as f: + with open( + "tests/unit/resources/toolbox_invoice_test_0_hocr.xml", "r", encoding="utf-8" + ) as f: expected = f.read() assert actual_hocr == expected @@ -701,10 +705,31 @@ def test_export_hocr_str_with_blank_document(): document_path="tests/unit/resources/blank_document.json" ) - actual_hocr = wrapped_document.export_hocr_str(title="hocr_blank") + actual_hocr = wrapped_document.export_hocr_str( + title="hocr_blank", inline_words=False + ) + + assert actual_hocr + + +def test_export_hocr_str_with_inline_words(): + wrapped_document = document.Document.from_document_path( + document_path="tests/unit/resources/0/toolbox_invoice_test-0.json" + ) + + actual_hocr = wrapped_document.export_hocr_str(title="toolbox_invoice_test-0") assert actual_hocr + with open( + "tests/unit/resources/toolbox_invoice_test_0_hocr_inline_words.xml", + "r", + encoding="utf-8", + ) as f: + expected = f.read() + + assert actual_hocr == expected + def test_document_to_merged_documentai_document(get_bytes_multiple_files_mock): wrapped_document = document.Document.from_gcs(