From 64a2ad31b9feb25a95bedcea56138580148dea24 Mon Sep 17 00:00:00 2001 From: danblooomberg Date: Tue, 14 Jan 2025 16:17:07 -0800 Subject: [PATCH] In pixFindBaselines(), remove baseline if no textbox is found for it * This is in relation to Issue #766. * If no textbox is found, we do not know the end points of the baseline. It is almost certainly very short, so it is removed from output. * Change order of operation: for each baseline, save all textboxes that describe text at that y-location. There can be multiple textboxes for each baseline if the line of text has large horizontal breaks. * As a result of this change, all reported baselines have x-value endpoints of text that can optionally be returned. --- prog/baseline1.png | Bin 0 -> 2366 bytes prog/baseline_reg.c | 22 +++++++++++++--- src/baseline.c | 63 +++++++++++++++++++++++++------------------- version-notes.html | 2 +- 4 files changed, 55 insertions(+), 32 deletions(-) create mode 100644 prog/baseline1.png diff --git a/prog/baseline1.png b/prog/baseline1.png new file mode 100644 index 0000000000000000000000000000000000000000..bcc511780b82d59aee906641e5d6b6f45fce9462 GIT binary patch literal 2366 zcmeAS@N?(olHy`uVBq!ia0y~yV7bn~z%hdnD8O*XuEqsOaTa()7K8X;FgHof2&m$N zr;B4q#hka-eY0){NH|=aqNgCOQ6ViWvc)9W>cxg=trW()PSYnOvbnum5+*CjUYMC8 zc}})g&gR}mo!8GF?znlZJ(_#n@oP!h8@Jww%F*2>7Tvw}=qPtIG)5D{XkHjC3Pwx9 zp;Z!Qr&-(F*~g!wiJE!WW^a4^_+#v^pEf?9-@OTVb3ba!q_?*dXFR)m$yNL2zW&1| z@2BZSo!vGcBw&2|$H)52xbI&!&EFRB=j-$Tm&-T({kQ$>C*82B_4PCLcy4cA3al&F z?%uSw_IK=7?%SXLz5h^}dNMUUSNg|BeU9bQ_kUFGH+s5g=iBQqYpiP|uWdWCJMXv2 zn%tS?{@di$Z||_Z{W#_Jou6j&_IC4b7o3|MS8J1VqwLOx+|$Qdisplay); @@ -133,11 +133,11 @@ L_REGPARAMS *rp; numaDestroy(&na); ptaDestroy(&pta); - /* Another test for baselines, with bogus short 'textblock' */ + /* Another test for baselines: very short textblock is removed */ pixadb = pixaCreate(6); - pix1 = pixRead("baseline2.png"); + pix1 = pixRead("baseline1.png"); na = pixFindBaselines(pix1, &pta, pixadb); - regTestCompareValues(rp, 3, numaGetCount(na), 0); /* 11 */ + regTestCompareValues(rp, 1, numaGetCount(na), 0); /* 11 */ pix2 = pixaDisplayTiledInRows(pixadb, 32, 1500, 1.0, 0, 30, 2); regTestWritePixAndCheck(rp, pix2, IFF_PNG); /* 12 */ pixDisplayWithTitle(pix2, 1400, 500, NULL, rp->display); @@ -145,6 +145,20 @@ L_REGPARAMS *rp; pixDestroy(&pix1); pixDestroy(&pix2); numaDestroy(&na); + ptaDestroy(&pta); + + /* Another test for baselines: bogus short-height 'textblock' */ + pixadb = pixaCreate(6); + pix1 = pixRead("baseline2.png"); + na = pixFindBaselines(pix1, &pta, pixadb); + regTestCompareValues(rp, 3, numaGetCount(na), 0); /* 13 */ + pix2 = pixaDisplayTiledInRows(pixadb, 32, 1500, 1.0, 0, 30, 2); + regTestWritePixAndCheck(rp, pix2, IFF_PNG); /* 14 */ + pixDisplayWithTitle(pix2, 1400, 500, NULL, rp->display); + pixaDestroy(&pixadb); + pixDestroy(&pix1); + pixDestroy(&pix2); + numaDestroy(&na); ptaDestroy(&pta); return regTestCleanup(rp); diff --git a/src/baseline.c b/src/baseline.c index 612da0593..9a2069524 100644 --- a/src/baseline.c +++ b/src/baseline.c @@ -120,7 +120,7 @@ pixFindBaselines(PIX *pixs, { l_int32 h, i, j, nbox, val1, val2, ndiff, bx, by, bw, bh; l_int32 imaxloc, peakthresh, zerothresh, inpeak; -l_int32 mintosearch, max, maxloc, nloc, locval; +l_int32 mintosearch, max, maxloc, nloc, locval, found, nremoved; l_int32 *array; l_float32 maxval; BOXA *boxa1, *boxa2, *boxa3; @@ -217,8 +217,9 @@ PTA *pta; numaDestroy(&naval); /* Generate an approximate profile of text line width. - * First, filter the boxes of text, where there may be - * more than one box for a given textline. */ + * First, consolidate and filter the boxes of text. + * The horizontal opening 'o30.1' removes lines of width + * less than 120 pixels at full resolution. */ pix2 = pixMorphSequence(pix1, "r11 + c20.1 + o30.1", 0); if (pixadb) pixaAddPix(pixadb, pix2, L_COPY); boxa1 = pixConnComp(pix2, NULL, 4); @@ -235,37 +236,45 @@ PTA *pta; boxaDestroy(&boxa1); boxaDestroy(&boxa2); - /* Optionally, find the baseline segments */ - pta = NULL; - if (ppta) { - pta = ptaCreate(0); - *ppta = pta; - } - if (pta) { - nloc = numaGetCount(naloc); - nbox = boxaGetCount(boxa3); - /* For each textbox, find the corresponding baseline. - * There may be more than one textbox to a baseline. - * Bogus textboxes of very small height may have been - * generated, and these are removed. Bogus textboxes can - * also be eliminated if the bottom is too far from any of - * the baselines. Note that the boxes are an expansion from - * 4x reduction, so box parameters are multiples of 4. */ - for (i = 0; i < nbox; i++) { - boxaGetBoxGeometry(boxa3, i, &bx, &by, &bw, &bh); - if (bh <= 8) continue; - for (j = 0; j < nloc; j++) { - numaGetIValue(naloc, j, &locval); - if (L_ABS(locval - (by + bh)) > 24) - continue; + /* For each baseline, find the corresponding textboxes. + * There may be more than one textbox to a baseline. + * Bogus textboxes of very small height may have been + * generated, and these are removed. Bogus textboxes can + * also be eliminated if the bottom is too far from any of + * the baselines. If there are no valid textboxes for a + * baseline, that baseline is removed. + * Note that the boxes have been expanded from 4x reduction, + * so box parameters are multiples of 4. */ + pta = ptaCreate(0); + nloc = numaGetCount(naloc); + nbox = boxaGetCount(boxa3); + nremoved = 0; /* keeps track of baselines removed */ + for (i = 0; i < nloc; i++) { + numaGetIValue(naloc, i, &locval); + found = FALSE; + for (j = 0; j < nbox; j++) { + boxaGetBoxGeometry(boxa3, j, &bx, &by, &bw, &bh); + if (bh > 8 && L_ABS(locval - (by + bh)) <= 24) { ptaAddPt(pta, bx, locval); ptaAddPt(pta, bx + bw, locval); - break; + found = TRUE; } } + if (!found) { /* no textbox corresponding to this baseline */ + L_INFO("short baseline %d at y = %d removed\n", __func__, + i + nremoved, locval); + numaRemoveNumber(naloc, i); + nremoved++; + i--; + nloc--; + } } boxaDestroy(&boxa3); + if (ppta) + *ppta = pta; + else + ptaDestroy(&pta); if (pixadb && pta) { /* display baselines */ l_int32 npts, x1, y1, x2, y2; pix1 = pixConvertTo32(pixs); diff --git a/version-notes.html b/version-notes.html index 1a8a613a3..b4a68f490 100644 --- a/version-notes.html +++ b/version-notes.html @@ -98,7 +98,7 @@

* Source files changed: baseline.c, pageseg.c, pdfapp.c, psio1.c * Prog files changed: baseline_reg.c, binarizefiles.c, compresspdf.c, croppdf.c, misctest2.c, - * Prog files added: baseline2.png + * Prog files added: baseline1.png, baseline2.png 1.85.0 Oct 16, 2024 * Use wrapper callSystemDebug() instead of system() in programs.