Skip to content

Commit

Permalink
In pixFindBaselines(), remove baseline if no textbox is found for it
Browse files Browse the repository at this point in the history
* This is in relation to Issue #766.
* If no textbox is found, we do not know the end points of the baseline.
  It is almost certainly very short, so it is removed from output.
* Change the order of operations: for each baseline, save all textboxes
  that describe text at that y-location.  There can be multiple textboxes
  for each baseline if the line of text has large horizontal breaks.
* As a result of this change, every reported baseline has text
  endpoints (x-values) that can optionally be returned.
  • Loading branch information
DanBloomberg committed Jan 15, 2025
1 parent f9ef244 commit 64a2ad3
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 32 deletions.
Binary file added prog/baseline1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
22 changes: 18 additions & 4 deletions prog/baseline_reg.c
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ L_REGPARAMS *rp;
pixDestroy(&pix4);
pix1 = pixDeskew(pix5, 2);
na = pixFindBaselines(pix1, &pta, pixadb);
regTestCompareValues(rp, 35, numaGetCount(na), 0); /* 9 */
regTestCompareValues(rp, 33, numaGetCount(na), 0); /* 9 */
pix2 = pixaDisplayTiledInRows(pixadb, 32, 1500, 1.0, 0, 30, 2);
regTestWritePixAndCheck(rp, pix2, IFF_PNG); /* 10 */
pixDisplayWithTitle(pix2, 800, 500, NULL, rp->display);
Expand All @@ -133,18 +133,32 @@ L_REGPARAMS *rp;
numaDestroy(&na);
ptaDestroy(&pta);

/* Another test for baselines, with bogus short 'textblock' */
/* Another test for baselines: very short textblock is removed */
pixadb = pixaCreate(6);
pix1 = pixRead("baseline2.png");
pix1 = pixRead("baseline1.png");
na = pixFindBaselines(pix1, &pta, pixadb);
regTestCompareValues(rp, 3, numaGetCount(na), 0); /* 11 */
regTestCompareValues(rp, 1, numaGetCount(na), 0); /* 11 */
pix2 = pixaDisplayTiledInRows(pixadb, 32, 1500, 1.0, 0, 30, 2);
regTestWritePixAndCheck(rp, pix2, IFF_PNG); /* 12 */
pixDisplayWithTitle(pix2, 1400, 500, NULL, rp->display);
pixaDestroy(&pixadb);
pixDestroy(&pix1);
pixDestroy(&pix2);
numaDestroy(&na);
ptaDestroy(&pta);

/* Another test for baselines: bogus short-height 'textblock' */
pixadb = pixaCreate(6);
pix1 = pixRead("baseline2.png");
na = pixFindBaselines(pix1, &pta, pixadb);
regTestCompareValues(rp, 3, numaGetCount(na), 0); /* 13 */
pix2 = pixaDisplayTiledInRows(pixadb, 32, 1500, 1.0, 0, 30, 2);
regTestWritePixAndCheck(rp, pix2, IFF_PNG); /* 14 */
pixDisplayWithTitle(pix2, 1400, 500, NULL, rp->display);
pixaDestroy(&pixadb);
pixDestroy(&pix1);
pixDestroy(&pix2);
numaDestroy(&na);
ptaDestroy(&pta);

return regTestCleanup(rp);
Expand Down
63 changes: 36 additions & 27 deletions src/baseline.c
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ pixFindBaselines(PIX *pixs,
{
l_int32 h, i, j, nbox, val1, val2, ndiff, bx, by, bw, bh;
l_int32 imaxloc, peakthresh, zerothresh, inpeak;
l_int32 mintosearch, max, maxloc, nloc, locval;
l_int32 mintosearch, max, maxloc, nloc, locval, found, nremoved;
l_int32 *array;
l_float32 maxval;
BOXA *boxa1, *boxa2, *boxa3;
Expand Down Expand Up @@ -217,8 +217,9 @@ PTA *pta;
numaDestroy(&naval);

/* Generate an approximate profile of text line width.
* First, filter the boxes of text, where there may be
* more than one box for a given textline. */
* First, consolidate and filter the boxes of text.
* The horizontal opening 'o30.1' removes lines of width
* less than 120 pixels at full resolution. */
pix2 = pixMorphSequence(pix1, "r11 + c20.1 + o30.1", 0);
if (pixadb) pixaAddPix(pixadb, pix2, L_COPY);
boxa1 = pixConnComp(pix2, NULL, 4);
Expand All @@ -235,37 +236,45 @@ PTA *pta;
boxaDestroy(&boxa1);
boxaDestroy(&boxa2);

/* Optionally, find the baseline segments */
pta = NULL;
if (ppta) {
pta = ptaCreate(0);
*ppta = pta;
}
if (pta) {
nloc = numaGetCount(naloc);
nbox = boxaGetCount(boxa3);
/* For each textbox, find the corresponding baseline.
* There may be more than one textbox to a baseline.
* Bogus textboxes of very small height may have been
* generated, and these are removed. Bogus textboxes can
* also be eliminated if the bottom is too far from any of
* the baselines. Note that the boxes are an expansion from
* 4x reduction, so box parameters are multiples of 4. */
for (i = 0; i < nbox; i++) {
boxaGetBoxGeometry(boxa3, i, &bx, &by, &bw, &bh);
if (bh <= 8) continue;
for (j = 0; j < nloc; j++) {
numaGetIValue(naloc, j, &locval);
if (L_ABS(locval - (by + bh)) > 24)
continue;
/* For each baseline, find the corresponding textboxes.
* There may be more than one textbox to a baseline.
* Bogus textboxes of very small height may have been
* generated, and these are removed. Bogus textboxes can
* also be eliminated if the bottom is too far from any of
* the baselines. If there are no valid textboxes for a
* baseline, that baseline is removed.
* Note that the boxes have been expanded from 4x reduction,
* so box parameters are multiples of 4. */
pta = ptaCreate(0);
nloc = numaGetCount(naloc);
nbox = boxaGetCount(boxa3);
nremoved = 0; /* keeps track of baselines removed */
for (i = 0; i < nloc; i++) {
numaGetIValue(naloc, i, &locval);
found = FALSE;
for (j = 0; j < nbox; j++) {
boxaGetBoxGeometry(boxa3, j, &bx, &by, &bw, &bh);
if (bh > 8 && L_ABS(locval - (by + bh)) <= 24) {
ptaAddPt(pta, bx, locval);
ptaAddPt(pta, bx + bw, locval);
break;
found = TRUE;
}
}
if (!found) { /* no textbox corresponding to this baseline */
L_INFO("short baseline %d at y = %d removed\n", __func__,
i + nremoved, locval);
numaRemoveNumber(naloc, i);
nremoved++;
i--;
nloc--;
}
}
boxaDestroy(&boxa3);

if (ppta)
*ppta = pta;
else
ptaDestroy(&pta);
if (pixadb && pta) { /* display baselines */
l_int32 npts, x1, y1, x2, y2;
pix1 = pixConvertTo32(pixs);
Expand Down
2 changes: 1 addition & 1 deletion version-notes.html
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ <h2 align=center> <IMG SRC="moller52.jpg" border=1 ALIGN_MIDDLE> </h2>
* Source files changed: baseline.c, pageseg.c, pdfapp.c, psio1.c
* Prog files changed: baseline_reg.c, binarizefiles.c,
compresspdf.c, croppdf.c, misctest2.c,
* Prog files added: baseline2.png
* Prog files added: baseline1.png, baseline2.png

1.85.0 Oct 16, 2024
* Use wrapper callSystemDebug() instead of system() in programs.
Expand Down

0 comments on commit 64a2ad3

Please sign in to comment.