Skip to content

Commit

Permalink
fix: Updates to hOCR Template to follow hOCR Spec
Browse files (browse the repository at this point in the history)
- Added validation in testing with https://github.com/kba/hocr-spec-python
  • Loading branch information
holtskinner committed Nov 7, 2023
1 parent e05cf50 commit 8d32327
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 37 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name="ocr-system" content="Document AI OCR" />
<meta name="ocr-langs" content="unknown" />
+<meta name="ocr-scripts" content="unknown" />
<meta name="ocr-number-of-pages" content="{{ pages|length }}" />
-<meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
+<meta name="ocr-capabilities" content="ocrp_lang ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
</head>
<body>
{% for page in pages -%}
Expand All @@ -16,13 +17,13 @@
{% set bidx = loop.index0 -%}
<span class='ocr_carea' id='block_{{ page_number }}_{{ bidx }}' title='{{ docai_block.hocr_bounding_box -}}'>{% for paragraph in docai_block.paragraphs -%}
{% set paridx = loop.index0 -%}
-<span class='ocr_par' id='par_{{ page_number }}_{{ bidx }}_{{ paridx }}' title='{{ paragraph.hocr_bounding_box -}}'>{% for line in paragraph.lines -%}
+<p class='ocr_par' id='par_{{ page_number }}_{{ bidx }}_{{ paridx }}' title='{{ paragraph.hocr_bounding_box -}}'>{% for line in paragraph.lines -%}
{% set lidx = loop.index0 -%}
<span class='ocr_line' id='line_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}' title='{{ line.hocr_bounding_box }}'>{{ line.text }}{% for token in line.tokens -%}
{% set tidx = loop.index0 -%}
<span class='ocrx_word' id='word_{{ page_number }}_{{ bidx }}_{{ paridx }}_{{ lidx }}_{{ tidx }}' title='{{ token.hocr_bounding_box }}'>{{ token.text }}</span>{% endfor -%}
</span>{% endfor -%}
-</span>{% endfor -%}
+</p>{% endfor -%}
</span>{% endfor -%}
</div>
{% endfor -%}
Expand Down
1 change: 1 addition & 0 deletions testing/constraints-3.7.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ google-cloud-documentai==2.20.0
google-cloud-storage==2.7.0
numpy==1.19.5
pikepdf==6.2.9
+hocr-spec==0.2.0
Loading

0 comments on commit 8d32327

Please sign in to comment.