From 3f52e82eaa741cd2c8a08e8398ed6f4b3f65c419 Mon Sep 17 00:00:00 2001 From: Holt Skinner <13262395+holtskinner@users.noreply.github.com> Date: Tue, 7 Nov 2023 13:56:13 -0600 Subject: [PATCH] fix: Updates to hOCR Template to follow hOCR Spec (#195) - Added validation in testing with https://github.com/kba/hocr-spec-python --- .../templates/hocr_document_template.xml.j2 | 7 +- .../test_convert_document_to_hocr_sample.py | 6 +- setup.py | 1 + testing/constraints-3.10.txt | 1 + testing/constraints-3.11.txt | 1 + testing/constraints-3.7.txt | 1 + testing/constraints-3.8.txt | 1 + testing/constraints-3.9.txt | 1 + .../resources/toolbox_invoice_test_0_hocr.xml | 67 ++++++++++--------- tests/unit/test_document.py | 12 +++- 10 files changed, 60 insertions(+), 38 deletions(-) diff --git a/google/cloud/documentai_toolbox/templates/hocr_document_template.xml.j2 b/google/cloud/documentai_toolbox/templates/hocr_document_template.xml.j2 index 63db0ada..dad071e1 100644 --- a/google/cloud/documentai_toolbox/templates/hocr_document_template.xml.j2 +++ b/google/cloud/documentai_toolbox/templates/hocr_document_template.xml.j2 @@ -6,8 +6,9 @@ + - +
{% for page in pages -%} @@ -16,13 +17,13 @@ {% set bidx = loop.index0 -%} {% for paragraph in docai_block.paragraphs -%} {% set paridx = loop.index0 -%} - {% for line in paragraph.lines -%} +{% for line in paragraph.lines -%} {% set lidx = loop.index0 -%} {{ line.text }}{% for token in line.tokens -%} {% set tidx = loop.index0 -%} {{ token.text }}{% endfor -%} {% endfor -%} -
{% endfor -%} + {% endfor -%} {% endfor -%} {% endfor -%} diff --git a/samples/snippets/test_convert_document_to_hocr_sample.py b/samples/snippets/test_convert_document_to_hocr_sample.py index e3ed9f2b..776c0b96 100644 --- a/samples/snippets/test_convert_document_to_hocr_sample.py +++ b/samples/snippets/test_convert_document_to_hocr_sample.py @@ -24,7 +24,11 @@ def test_convert_document_to_hocr_sample() -> None: document_path=document_path, document_title=document_title ) - with open("../../tests/unit/resources/toolbox_invoice_test_0_hocr.xml", "r") as f: + with open( + "../../tests/unit/resources/toolbox_invoice_test_0_hocr.xml", + "r", + encoding="utf-8", + ) as f: expected = f.read() assert actual == expected diff --git a/setup.py b/setup.py index 7a29932e..abece197 100644 --- a/setup.py +++ b/setup.py @@ -66,6 +66,7 @@ "immutabledict >= 2.0.0, < 3.0.0dev; python_version<'3.8'", "Pillow >= 9.5.0, < 11.0.0", "Jinja2 >= 3.1.0, <= 4.0.0", + "hocr-spec >= 0.2.0", ), python_requires=">=3.7", classifiers=[ diff --git a/testing/constraints-3.10.txt b/testing/constraints-3.10.txt index c9f0e4bb..25aa22a8 100644 --- a/testing/constraints-3.10.txt +++ b/testing/constraints-3.10.txt @@ -11,3 +11,4 @@ google-cloud-documentai google-cloud-storage numpy pikepdf +hocr-spec diff --git a/testing/constraints-3.11.txt b/testing/constraints-3.11.txt index c9f0e4bb..25aa22a8 100644 --- a/testing/constraints-3.11.txt +++ b/testing/constraints-3.11.txt @@ -11,3 +11,4 @@ google-cloud-documentai google-cloud-storage numpy pikepdf +hocr-spec diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt index 3c64ab2e..0a9af7ff 100644 --- a/testing/constraints-3.7.txt +++ b/testing/constraints-3.7.txt @@ -14,3 +14,4 @@ google-cloud-documentai==2.20.0 google-cloud-storage==2.7.0 numpy==1.19.5 pikepdf==6.2.9 +hocr-spec==0.2.0 diff --git a/testing/constraints-3.8.txt b/testing/constraints-3.8.txt index ed1905e2..a9d4c497 100644 --- a/testing/constraints-3.8.txt +++ b/testing/constraints-3.8.txt @@ -11,3 +11,4 @@ google-cloud-documentai google-cloud-storage numpy==1.21.6 pikepdf==8.2.3 +hocr-spec diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index c9f0e4bb..25aa22a8 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -11,3 +11,4 @@ google-cloud-documentai google-cloud-storage numpy pikepdf +hocr-spec diff --git a/tests/unit/resources/toolbox_invoice_test_0_hocr.xml b/tests/unit/resources/toolbox_invoice_test_0_hocr.xml index 0cd8e171..4e265f7d 100644 --- a/tests/unit/resources/toolbox_invoice_test_0_hocr.xml +++ b/tests/unit/resources/toolbox_invoice_test_0_hocr.xml @@ -6,84 +6,85 @@ + - + -Invoice Invoice -
DATE: 01/01/1970 +DATE: 01/01/1970 DATE: 01/01/1970 INVOICE: NO. 001 INVOICE: NO. 001 -
FROM: Company ABC +FROM: Company ABC FROM: Company ABC user@companyabc.com user@companyabc.com -
TO: John Doe +TO: John Doe TO: John Doe johndoe@email.com johndoe@email.com -
ADDRESS: 111 Main Street +ADDRESS: 111 Main Street ADDRESS: 111 Main Street Anytown, USA Anytown, USA -
ADDRESS: 222 Main Street +ADDRESS: 222 Main Street ADDRESS: 222 Main Street Anytown, USA Anytown, USA -
TERMS: 6 month contract +TERMS: 6 month contract TERMS: 6 month contract DUE: 01/01/2025 DUE: 01/01/2025 -
Item Description +Item Description Item Description -
Quantity +Quantity Quantity -
Price +Price Price -
Amount +Amount Amount -
Tool A +Tool A Tool A -
500 +500 500 -
$1.00 +$1.00 $1.00 -
$500.00 +$500.00 $500.00 -
Service B +Service B Service B -
1 +1 1 -
$900.00 +$900.00 $900.00 -
$900.00 +$900.00 $900.00 -
Resource C +Resource C Resource C -
50 +50 50 -
$12.00 +$12.00 $12.00 -
$600.00 +$600.00 $600.00 -
Subtotal +Subtotal Subtotal -
$2000.00 +$2000.00 $2000.00 -
Tax +Tax Tax -
$140.00 +$140.00 $140.00 -
BALANCE DUE +BALANCE DUE BALANCE DUE -
$2140.00 +$2140.00 $2140.00 -
NOTES: +NOTES: NOTES: -
Supplies used for Project Q. +Supplies used for Project Q. Supplies used for Project Q. -