Skip to content

Commit

Permalink
PDFBOX-3774: conditionally ignore spaces from the content stream; add…
Browse files Browse the repository at this point in the history
… setting + getter/setter + test + code simplification by Kevin Day

git-svn-id: https://svn.apache.org/repos/asf/pdfbox/trunk@1922535 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
THausherr committed Dec 16, 2024
1 parent 090c983 commit 76e1ae1
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 5 deletions.
40 changes: 35 additions & 5 deletions pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine
private boolean shouldSeparateByBeads = true;
private boolean sortByPosition = false;
private boolean addMoreFormatting = false;
// PDFBOX-3774: when true, text positions whose unicode value is a single space (" ") are
// skipped while the extracted text is processed; defaults to false.
private boolean ignoreContentStreamSpaceGlyphs = false;

private float indentThreshold = defaultIndentThreshold;
private float dropThreshold = defaultDropThreshold;
Expand Down Expand Up @@ -523,11 +524,8 @@ protected void writePage() throws IOException
{
IterativeMergeSort.sort(textList, comparator);
}
finally
{
// PDFBOX-5487: Remove all space characters if contained within the adjacent letters
removeContainedSpaces(textList);
}
// PDFBOX-5487: Remove all space characters if contained within the adjacent letters
removeContainedSpaces(textList);
}

startArticle();
Expand Down Expand Up @@ -555,6 +553,12 @@ protected void writePage() throws IOException
PositionWrapper current = new PositionWrapper(position);
String characterValue = position.getUnicode();

// PDFBOX-3774: conditionally ignore spaces from the content stream
if (" ".equals(characterValue) && getIgnoreContentStreamSpaceGlyphs())
{
continue;
}

// Resets the average character width when we see a change in font
// or a change in the font size
if (lastPosition != null &&
Expand Down Expand Up @@ -1276,6 +1280,32 @@ public void setSortByPosition(boolean newSortByPosition)
sortByPosition = newSortByPosition;
}

/**
 * Tells whether space glyphs that appear in the content stream text rendering instructions are
 * ignored during text extraction.
 *
 * @return true if space glyphs in the content stream text rendering instructions are ignored;
 * the default is false.
 */
public boolean getIgnoreContentStreamSpaceGlyphs()
{
    return this.ignoreContentStreamSpaceGlyphs;
}

/**
 * Controls whether spaces found in the text rendering instructions of the content stream are
 * ignored, so that word breaks are determined purely by the text extraction algorithm.
 * <p>
 * Enabling this can improve text extraction results where the content stream is sorted by
 * position and has text overlapping spaces, but it could cause some word breaks to be missing
 * from the output.
 *
 * @param newIgnoreContentStreamSpaceGlyphs true if PDFBox should ignore content stream spaces.
 */
public void setIgnoreContentStreamSpaceGlyphs(boolean newIgnoreContentStreamSpaceGlyphs)
{
    this.ignoreContentStreamSpaceGlyphs = newIgnoreContentStreamSpaceGlyphs;
}

/**
* Get the current space width-based tolerance value that is being used to estimate where spaces in text should be
* added. Note that the default value for this has been determined from trial and error.
Expand Down
51 changes: 51 additions & 0 deletions pdfbox/src/test/java/org/apache/pdfbox/text/TestTextStripper.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,13 @@
import org.apache.fontbox.util.BoundingBox;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.PDType3Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
Expand Down Expand Up @@ -688,4 +692,51 @@ void testStartEndPage() throws IOException
assertEquals(1378, text.replaceAll("\r", "").length());
}
}

/**
 * PDFBOX-3774: verifies text extraction when the IgnoreContentStreamSpaceGlyphs option is
 * enabled. A page is built in memory with a parenthesised string containing a space glyph and
 * a second, larger string placed on the same baseline; the extracted text is then compared
 * against the expected result.
 *
 * @throws Exception if building the document or extracting its text fails.
 */
@Test
void testIgnoreContentStreamSpaceGlyphs() throws Exception
{
    try (PDDocument document = new PDDocument())
    {
        PDPage firstPage = new PDPage();
        try (PDPageContentStream contents = new PDPageContentStream(document, firstPage))
        {
            float baseFontSize = 8;
            float startX = 50;
            float baselineY = firstPage.getMediaBox().getHeight() - 50;

            // first text run: parentheses enclosing space glyph(s), in Helvetica
            PDFont baseFont = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
            contents.beginText();
            contents.setFont(baseFont, baseFontSize);
            contents.newLineAtOffset(startX, baselineY);
            contents.showText("( )");
            contents.endText();

            // second text run: same baseline, shifted right by a number of average
            // Helvetica glyph widths and drawn at twice the size in Times Roman
            int indentGlyphs = 6;
            float secondRunX = startX
                    + indentGlyphs * baseFont.getAverageFontWidth() / 1000f * baseFontSize;
            PDFont secondFont = new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN);
            contents.beginText();
            contents.setFont(secondFont, baseFontSize * 2f);
            contents.newLineAtOffset(secondRunX, baselineY);
            contents.showText("overlap");
            contents.endText();
        }
        document.addPage(firstPage);

        // extract page 1 only, sorted by position, with "\n" as line and page terminator
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setLineSeparator("\n");
        stripper.setPageEnd("\n");
        stripper.setStartPage(1);
        stripper.setEndPage(1);
        stripper.setSortByPosition(true);

        stripper.setIgnoreContentStreamSpaceGlyphs(true);
        String extracted = stripper.getText(document);
        assertEquals("( overlap )\n", extracted);
    }
}
}

0 comments on commit 76e1ae1

Please sign in to comment.