Skip to content

Commit

Permalink
fix: out-of-range vector error (#15)
Browse files (browse the repository at this point in the history)
* fix out-of-range vector error

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* reformat code

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
  • Loading branch information
PeterStaar-IBM authored Aug 29, 2024
1 parent 9c8cccc commit 4ed034c
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 23 deletions.
15 changes: 11 additions & 4 deletions docling_parse/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def main():

# Load the document
success = parser.load_document(doc_key, doc_file)
# parser.set_loglevel(args.log_level)

# Get number of pages
num_pages = parser.number_of_pages(doc_key)
Expand All @@ -56,7 +57,10 @@ def main():
json_doc = parser.parse_pdf_from_key_on_page(doc_key, page)

if "pages" not in json_doc: # page could not get parsed
print(f"ERROR: page {page} is not parsed ... ")
continue
else:
print(f"page {page} is parsed ... ")

json_page = json_doc["pages"][0]

Expand All @@ -80,10 +84,13 @@ def main():
]
)

print(f"cells of page: {page}")
print(
tabulate(cells, headers=["page", "cell-id", "text", "x0", "y0", "x1", "y1"])
)
if False:
print(f"cells of page: {page}")
print(
tabulate(
cells, headers=["page", "cell-id", "text", "x0", "y0", "x1", "y1"]
)
)

# find bitmap images
images = []
Expand Down
52 changes: 34 additions & 18 deletions src/proj_folders/pdf_parser/post_processors/build_hv_lines.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,16 +121,16 @@ namespace pdf_lib
std::vector<scalar_type>& x,
std::vector<scalar_type>& y)
{
//logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;

if(j+1<x.size())
return false;

scalar_type x0 = x[j+0];
scalar_type y0 = y[j+0];
scalar_type x0 = x.at(j+0);
scalar_type y0 = y.at(j+0);

scalar_type x1 = x[j+1];
scalar_type y1 = y[j+1];
scalar_type x1 = x.at(j+1);
scalar_type y1 = y.at(j+1);

if(std::abs(x1-x0)>1.e-3 and
std::abs(y1-y0)<1.e-3)
Expand All @@ -144,16 +144,16 @@ namespace pdf_lib
std::vector<scalar_type>& x,
std::vector<scalar_type>& y)
{
//logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;

if(j+1<x.size())
return false;

scalar_type x0 = x[j+0];
scalar_type y0 = y[j+0];
scalar_type x0 = x.at(j+0);
scalar_type y0 = y.at(j+0);

scalar_type x1 = x[j+1];
scalar_type y1 = y[j+1];
scalar_type x1 = x.at(j+1);
scalar_type y1 = y.at(j+1);

if(std::abs(x1-x0)<1.e-3 and
std::abs(y1-y0)>1.e-3)
Expand All @@ -167,7 +167,7 @@ namespace pdf_lib
scalar_type x1, scalar_type y1,
std::vector<horizontal_line<scalar_type> >& hlines_)
{
//logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;

horizontal_line<scalar_type> hline;
hline.y = y0;
Expand All @@ -183,7 +183,7 @@ namespace pdf_lib
scalar_type x1, scalar_type y1,
std::vector<vertical_line<scalar_type> >& vlines_)
{
//logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;

vertical_line<scalar_type> vline;
vline.x = x0;
Expand All @@ -199,7 +199,7 @@ namespace pdf_lib
std::vector<vertical_line <scalar_type> >& vlines_,
std::vector<horizontal_line<scalar_type> >& hlines_)
{
//logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;

hlines_.clear();
vlines_.clear();
Expand All @@ -213,15 +213,31 @@ namespace pdf_lib
x <= paths[k][core::keys<core::PATH>::x_values()];
y <= paths[k][core::keys<core::PATH>::y_values()];

/*
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t #-subpaths: " << subpaths.size();
for(int i=0; i<subpaths.size(); i++)
{
logging_lib::info("pdf-parser") << i << "\t" << subpaths.at(i);
}
*/

for(int i=0; i<subpaths.size()-1; i++)
{
for(int j=subpaths[i+0]; j<subpaths[i+1]; j++)
//logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t subpath (" << i << "): " << subpaths.size();
for(int j=subpaths.at(i+0); j<subpaths.at(i+1); j++)
{
scalar_type x0 = x[j+0];
scalar_type y0 = y[j+0];
//logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t x/y: " << j << "/" << x.size() << ":" << y.size();

if(j+1>=x.size() or j+1>=y.size()) // skip
{
continue;
}

scalar_type x0 = x.at(j+0);
scalar_type y0 = y.at(j+0);

scalar_type x1 = x[j+1];
scalar_type y1 = y[j+1];
scalar_type x1 = x.at(j+1);
scalar_type y1 = y.at(j+1);

if(std::abs(y1-y0)<1.e-3 and std::abs(x1-x0)>1.e-3)
register_hline(x0, y0, x1, y1, hlines_);
Expand Down
11 changes: 10 additions & 1 deletion src/proj_folders/pdf_parser/post_processors/split_textcells.h
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,13 @@ namespace pdf_lib
while(splitting)
{
splitting=false;

/*
for(int j=0; j<vlines.size(); j+=1)
{
logging_lib::info("pdf-parser") << "vline (" << j << "): " << vlines[j].x << "," << vlines[j].y0 << "," << vlines[j].y1;
}
*/

for(int i=0; i<cells.get_size(); i+=1)
{
Expand All @@ -530,13 +537,15 @@ namespace pdf_lib
//auto height = bbox.height();
//auto width = bbox.width();

//logging_lib::info("pdf-parser") << "\t cell \""<< cell_m.text << "\"";
//logging_lib::info("pdf-parser") << cell_m.text << "\t" << x0 << "," << y0 << "," << x1 << "," << y1 << "\n";
for(int j=0; j<vlines.size(); j+=1)
{
//if((x0+0.00*width < vlines[j].x and vlines[j].x < x1-0.00*width) and
//(vlines[j].y0 < y0+0.05*height and y1-0.05*height < vlines[j].y1))
if(post_processor<BUILD_HV_LINES, scalar_type>::is_split_by_vline(x0, y0, x1, y1, vlines[j]))
{
//logging_lib::info("pdf-parser") << "vline: " << vlines[j].x << "," << vlines[j].y0 << "," << vlines[j].y1;

logging_lib::warn("pdf-parser") << "\t --> splitting cell \""<< cell_m.text << "\"";
splitting=split_cell_by_vline_on_page(i, cells, vlines[j]);
}
Expand Down

0 comments on commit 4ed034c

Please sign in to comment.