diff --git a/prog/baseline2.png b/prog/baseline2.png new file mode 100644 index 000000000..de80f1713 Binary files /dev/null and b/prog/baseline2.png differ diff --git a/prog/baseline_reg.c b/prog/baseline_reg.c index 307a13f11..d00e65289 100644 --- a/prog/baseline_reg.c +++ b/prog/baseline_reg.c @@ -131,6 +131,20 @@ L_REGPARAMS *rp; pixDestroy(&pix2); pixDestroy(&pix5); numaDestroy(&na); + ptaDestroy(&pta); + + /* Another test for baselines, with bogus short 'textblock' */ + pixadb = pixaCreate(6); + pix1 = pixRead("baseline2.png"); + na = pixFindBaselines(pix1, &pta, pixadb); + regTestCompareValues(rp, 3, numaGetCount(na), 0); /* 11 */ + pix2 = pixaDisplayTiledInRows(pixadb, 32, 1500, 1.0, 0, 30, 2); + regTestWritePixAndCheck(rp, pix2, IFF_PNG); /* 12 */ + pixDisplayWithTitle(pix2, 1400, 500, NULL, rp->display); + pixaDestroy(&pixadb); + pixDestroy(&pix1); + pixDestroy(&pix2); + numaDestroy(&na); ptaDestroy(&pta); return regTestCleanup(rp); diff --git a/src/baseline.c b/src/baseline.c index d2bf30acc..612da0593 100644 --- a/src/baseline.c +++ b/src/baseline.c @@ -242,19 +242,27 @@ PTA *pta; *ppta = pta; } if (pta) { - nloc = numaGetCount(naloc); - nbox = boxaGetCount(boxa3); - for (i = 0; i < nbox; i++) { - boxaGetBoxGeometry(boxa3, i, &bx, &by, &bw, &bh); - for (j = 0; j < nloc; j++) { - numaGetIValue(naloc, j, &locval); - if (L_ABS(locval - (by + bh)) > 25) - continue; - ptaAddPt(pta, bx, locval); - ptaAddPt(pta, bx + bw, locval); - break; - } - } + nloc = numaGetCount(naloc); + nbox = boxaGetCount(boxa3); + /* For each textbox, find the corresponding baseline. + * There may be more than one textbox to a baseline. + * Bogus textboxes of very small height may have been + * generated, and these are removed. Bogus textboxes can + * also be eliminated if the bottom is too far from any of + * the baselines. Note that the boxes are an expansion from + * 4x reduction, so box parameters are multiples of 4. 
*/ + for (i = 0; i < nbox; i++) { + boxaGetBoxGeometry(boxa3, i, &bx, &by, &bw, &bh); + if (bh <= 8) continue; + for (j = 0; j < nloc; j++) { + numaGetIValue(naloc, j, &locval); + if (L_ABS(locval - (by + bh)) > 24) + continue; + ptaAddPt(pta, bx, locval); + ptaAddPt(pta, bx + bw, locval); + break; + } + } } boxaDestroy(&boxa3); diff --git a/version-notes.html b/version-notes.html index 00c0f82c8..1a8a613a3 100644 --- a/version-notes.html +++ b/version-notes.html @@ -90,11 +90,15 @@
1.86.0 Not released + * Modify pixFindBaselines() to avoid joining textboxes and to + ignore bogus textboxes when listing baseline end points. + * Modify convertToPSEmbed() to efficiently encode webp input images. * Modify compressFilesToPdf() to allow upscale interpolation for low resolution pdfs. - * Source files changed: pageseg.c, pdfapp.c - * Prog files changed: binarizefiles.c, compresspdf.c, croppdf.c, - misctest2.c + * Source files changed: baseline.c, pageseg.c, pdfapp.c, psio1.c + * Prog files changed: baseline_reg.c, binarizefiles.c, + compresspdf.c, croppdf.c, misctest2.c + * Prog files added: baseline2.png 1.85.0 Oct 16, 2024 * Use wrapper callSystemDebug() instead of system() in programs.