From 3f52e82eaa741cd2c8a08e8398ed6f4b3f65c419 Mon Sep 17 00:00:00 2001 From: Holt Skinner <13262395+holtskinner@users.noreply.github.com> Date: Tue, 7 Nov 2023 13:56:13 -0600 Subject: [PATCH] fix: Updates to hOCR Template to follow hOCR Spec (#195) - Added validation in testing with https://github.com/kba/hocr-spec-python --- .../templates/hocr_document_template.xml.j2 | 7 +- .../test_convert_document_to_hocr_sample.py | 6 +- setup.py | 1 + testing/constraints-3.10.txt | 1 + testing/constraints-3.11.txt | 1 + testing/constraints-3.7.txt | 1 + testing/constraints-3.8.txt | 1 + testing/constraints-3.9.txt | 1 + .../resources/toolbox_invoice_test_0_hocr.xml | 67 ++++++++++--------- tests/unit/test_document.py | 12 +++- 10 files changed, 60 insertions(+), 38 deletions(-) diff --git a/google/cloud/documentai_toolbox/templates/hocr_document_template.xml.j2 b/google/cloud/documentai_toolbox/templates/hocr_document_template.xml.j2 index 63db0ada..dad071e1 100644 --- a/google/cloud/documentai_toolbox/templates/hocr_document_template.xml.j2 +++ b/google/cloud/documentai_toolbox/templates/hocr_document_template.xml.j2 @@ -6,8 +6,9 @@ + - + {% for page in pages -%} @@ -16,13 +17,13 @@ {% set bidx = loop.index0 -%} {% for paragraph in docai_block.paragraphs -%} {% set paridx = loop.index0 -%} - {% for line in paragraph.lines -%} +

{% for line in paragraph.lines -%} {% set lidx = loop.index0 -%} {{ line.text }}{% for token in line.tokens -%} {% set tidx = loop.index0 -%} {{ token.text }}{% endfor -%} {% endfor -%} - {% endfor -%} +

{% endfor -%}
{% endfor -%} {% endfor -%} diff --git a/samples/snippets/test_convert_document_to_hocr_sample.py b/samples/snippets/test_convert_document_to_hocr_sample.py index e3ed9f2b..776c0b96 100644 --- a/samples/snippets/test_convert_document_to_hocr_sample.py +++ b/samples/snippets/test_convert_document_to_hocr_sample.py @@ -24,7 +24,11 @@ def test_convert_document_to_hocr_sample() -> None: document_path=document_path, document_title=document_title ) - with open("../../tests/unit/resources/toolbox_invoice_test_0_hocr.xml", "r") as f: + with open( + "../../tests/unit/resources/toolbox_invoice_test_0_hocr.xml", + "r", + encoding="utf-8", + ) as f: expected = f.read() assert actual == expected diff --git a/setup.py b/setup.py index 7a29932e..abece197 100644 --- a/setup.py +++ b/setup.py @@ -66,6 +66,7 @@ "immutabledict >= 2.0.0, < 3.0.0dev; python_version<'3.8'", "Pillow >= 9.5.0, < 11.0.0", "Jinja2 >= 3.1.0, <= 4.0.0", + "hocr-spec >= 0.2.0", ), python_requires=">=3.7", classifiers=[ diff --git a/testing/constraints-3.10.txt b/testing/constraints-3.10.txt index c9f0e4bb..25aa22a8 100644 --- a/testing/constraints-3.10.txt +++ b/testing/constraints-3.10.txt @@ -11,3 +11,4 @@ google-cloud-documentai google-cloud-storage numpy pikepdf +hocr-spec diff --git a/testing/constraints-3.11.txt b/testing/constraints-3.11.txt index c9f0e4bb..25aa22a8 100644 --- a/testing/constraints-3.11.txt +++ b/testing/constraints-3.11.txt @@ -11,3 +11,4 @@ google-cloud-documentai google-cloud-storage numpy pikepdf +hocr-spec diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt index 3c64ab2e..0a9af7ff 100644 --- a/testing/constraints-3.7.txt +++ b/testing/constraints-3.7.txt @@ -14,3 +14,4 @@ google-cloud-documentai==2.20.0 google-cloud-storage==2.7.0 numpy==1.19.5 pikepdf==6.2.9 +hocr-spec==0.2.0 diff --git a/testing/constraints-3.8.txt b/testing/constraints-3.8.txt index ed1905e2..a9d4c497 100644 --- a/testing/constraints-3.8.txt +++ b/testing/constraints-3.8.txt @@ -11,3 +11,4 @@ google-cloud-documentai google-cloud-storage numpy==1.21.6 pikepdf==8.2.3 +hocr-spec diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index c9f0e4bb..25aa22a8 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -11,3 +11,4 @@ google-cloud-documentai google-cloud-storage numpy pikepdf +hocr-spec diff --git a/tests/unit/resources/toolbox_invoice_test_0_hocr.xml b/tests/unit/resources/toolbox_invoice_test_0_hocr.xml index 0cd8e171..4e265f7d 100644 --- a/tests/unit/resources/toolbox_invoice_test_0_hocr.xml +++ b/tests/unit/resources/toolbox_invoice_test_0_hocr.xml @@ -6,84 +6,85 @@ + - + -
Invoice +

Invoice Invoice -DATE: 01/01/1970 +

DATE: 01/01/1970 DATE: 01/01/1970 INVOICE: NO. 001 INVOICE: NO. 001 -FROM: Company ABC +

FROM: Company ABC FROM: Company ABC user@companyabc.com user@companyabc.com -TO: John Doe +

TO: John Doe TO: John Doe johndoe@email.com johndoe@email.com -ADDRESS: 111 Main Street +

ADDRESS: 111 Main Street ADDRESS: 111 Main Street Anytown, USA Anytown, USA -ADDRESS: 222 Main Street +

ADDRESS: 222 Main Street ADDRESS: 222 Main Street Anytown, USA Anytown, USA -TERMS: 6 month contract +

TERMS: 6 month contract TERMS: 6 month contract DUE: 01/01/2025 DUE: 01/01/2025 -Item Description +

Item Description Item Description -Quantity +

Quantity Quantity -Price +

Price Price -Amount +

Amount Amount -Tool A +

Tool A Tool A -500 +

500 500 -$1.00 +

$1.00 $1.00 -$500.00 +

$500.00 $500.00 -Service B +

Service B Service B -1 +

1 1 -$900.00 +

$900.00 $900.00 -$900.00 +

$900.00 $900.00 -Resource C +

Resource C Resource C -50 +

50 50 -$12.00 +

$12.00 $12.00 -$600.00 +

$600.00 $600.00 -Subtotal +

Subtotal Subtotal -$2000.00 +

$2000.00 $2000.00 -Tax +

Tax Tax -$140.00 +

$140.00 $140.00 -BALANCE DUE +

BALANCE DUE BALANCE DUE -$2140.00 +

$2140.00 $2140.00 -NOTES: +

NOTES: NOTES: -Supplies used for Project Q. +

Supplies used for Project Q. Supplies used for Project Q. -

+

\ No newline at end of file diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index d988e1e0..86366f27 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from io import BytesIO import json import os import shutil @@ -32,6 +33,8 @@ from google.cloud import documentai from google.cloud.documentai_toolbox import document, gcs_utilities +from hocr_spec import HocrValidator + def get_bytes(file_name): result = [] @@ -689,8 +692,15 @@ def test_export_hocr_str(): ) actual_hocr = wrapped_document.export_hocr_str(title="toolbox_invoice_test-0") + assert actual_hocr + validator = HocrValidator(profile="standard") + report = validator.validate(BytesIO(actual_hocr.encode("utf-8")), parse_strict=True) + + assert report.format("bool") - with open("tests/unit/resources/toolbox_invoice_test_0_hocr.xml", "r") as f: + with open( + "tests/unit/resources/toolbox_invoice_test_0_hocr.xml", "r", encoding="utf-8" + ) as f: expected = f.read() assert actual_hocr == expected