Skip to content

Commit

Permalink
[text selection] Add the whitespaces present in the pdf in the text c…
Browse files Browse the repository at this point in the history
…hunk

- it aims to fix issue #14627;
- the basic idea of the recent text refactoring was to only consider the rendered, visible whitespaces.
  But sometimes the heuristics aren't correct: although some whitespaces were in the text stream,
  they weren't in the text chunks because they were too small. Hence we added some exceptions; for example,
  we always add a whitespace when it is between two non-whitespace chars, but only when they're in the same Tj.
  So basically, this patch removes the constraint of having the chars in the same Tj
  (by using a cyclic buffer to save the last two chars) but doesn't add a space when the visible space is really
  too small (hence `NOT_A_SPACE_FACTOR`).
  • Loading branch information
calixteman committed Mar 21, 2022
1 parent feea2b7 commit 8b4b920
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 23 deletions.
83 changes: 69 additions & 14 deletions src/core/evaluator.js
Original file line number Diff line number Diff line change
Expand Up @@ -2187,17 +2187,50 @@ class PartialEvaluator {
spaceInFlowMax: 0,
trackingSpaceMin: Infinity,
negativeSpaceMax: -Infinity,
notASpace: -Infinity,
transform: null,
fontName: null,
hasEOL: false,
};

// Use a cyclic buffer (length === 2) to save the last chars in the
// text stream.
// It's useful to know when we need to add a whitespace in the
// text chunk.
const twoLastChars = [" ", " "];
let twoLastCharsPos = 0;

/**
* Save the last char.
* @param {string} char
* @returns {boolean} true when the two last chars before adding the new one
* are a non-whitespace followed by a whitespace.
*/
function saveLastChar(char) {
const nextPos = (twoLastCharsPos + 1) % 2;
const ret =
twoLastChars[twoLastCharsPos] !== " " && twoLastChars[nextPos] === " ";
twoLastChars[twoLastCharsPos] = char;
twoLastCharsPos = nextPos;

return ret;
}

function resetLastChars() {
twoLastChars[0] = twoLastChars[1] = " ";
twoLastCharsPos = 0;
}

// Used in addFakeSpaces.

// A white gap whose size is <= fontSize * TRACKING_SPACE_FACTOR is a
// tracking space, so it doesn't count as a space.
const TRACKING_SPACE_FACTOR = 0.1;

// When a white gap is <= fontSize * NOT_A_SPACE_FACTOR, no space is
// inserted even if one is present in the text stream: the saved last chars
// are reset so that a pending whitespace is dropped.
const NOT_A_SPACE_FACTOR = 0.05;

// A negative white < fontSize * NEGATIVE_SPACE_FACTOR induces
// a break (a new chunk of text is created).
// It doesn't change anything when the text is copied but
Expand Down Expand Up @@ -2299,6 +2332,7 @@ class PartialEvaluator {

textContentItem.trackingSpaceMin =
textState.fontSize * TRACKING_SPACE_FACTOR;
textContentItem.notASpace = textState.fontSize * NOT_A_SPACE_FACTOR;
textContentItem.negativeSpaceMax =
textState.fontSize * NEGATIVE_SPACE_FACTOR;
textContentItem.spaceInFlowMin =
Expand Down Expand Up @@ -2483,6 +2517,7 @@ class PartialEvaluator {
return true;
}

resetLastChars();
flushTextContentItem();
return true;
}
Expand All @@ -2491,7 +2526,17 @@ class PartialEvaluator {
appendEOL();
return true;
}

if (advanceY <= textOrientation * textContentItem.notASpace) {
// The real spacing between 2 consecutive chars is thin enough to be
// considered a non-space.
resetLastChars();
}

if (advanceY <= textOrientation * textContentItem.trackingSpaceMin) {
if (advanceY <= textContentItem.notASpace) {
resetLastChars();
}
textContentItem.height += advanceY;
} else if (
!addFakeSpaces(
Expand All @@ -2501,6 +2546,7 @@ class PartialEvaluator {
)
) {
if (textContentItem.str.length === 0) {
resetLastChars();
textContent.items.push({
str: " ",
dir: "ltr",
Expand Down Expand Up @@ -2532,6 +2578,10 @@ class PartialEvaluator {
appendEOL();
return true;
}

// We're moving back so in case the last char was a whitespace
// we cancel it: it doesn't make sense to insert it.
resetLastChars();
flushTextContentItem();
return true;
}
Expand All @@ -2541,12 +2591,19 @@ class PartialEvaluator {
return true;
}

if (advanceX <= textOrientation * textContentItem.notASpace) {
// The real spacing between 2 consecutive chars is thin enough to be
// considered a non-space.
resetLastChars();
}

if (advanceX <= textOrientation * textContentItem.trackingSpaceMin) {
textContentItem.width += advanceX;
} else if (
!addFakeSpaces(advanceX, textContentItem.prevTransform, textOrientation)
) {
if (textContentItem.str.length === 0) {
resetLastChars();
textContent.items.push({
str: " ",
dir: "ltr",
Expand Down Expand Up @@ -2600,14 +2657,7 @@ class PartialEvaluator {
}
let scaledDim = glyphWidth * scale;

if (
glyph.isWhitespace &&
(i === 0 ||
i + 1 === ii ||
glyphs[i - 1].isWhitespace ||
glyphs[i + 1].isWhitespace ||
extraSpacing)
) {
if (glyph.isWhitespace) {
// Don't push a " " in the textContentItem
// (except when it's between two non-spaces chars),
// it will be done (if required) in next call to
Expand All @@ -2623,6 +2673,7 @@ class PartialEvaluator {
charSpacing += -scaledDim + textState.wordSpacing;
textState.translateTextMatrix(0, -charSpacing);
}
saveLastChar(" ");
continue;
}

Expand Down Expand Up @@ -2653,17 +2704,18 @@ class PartialEvaluator {
textChunk.prevTransform = getCurrentTextTransform();
}

if (glyph.isWhitespace) {
let glyphUnicode = glyph.unicode;
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
glyphUnicode = reverseIfRtl(glyphUnicode);
if (saveLastChar(glyphUnicode)) {
// The two last chars are a non-whitespace followed by a whitespace
// and then this non-whitespace, so we insert a whitespace here.
// Replaces all whitespaces with standard spaces (0x20), to avoid
// alignment issues between the textLayer and the canvas if the text
// contains e.g. tabs (fixes issue6612.pdf).
textChunk.str.push(" ");
} else {
let glyphUnicode = glyph.unicode;
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
glyphUnicode = reverseIfRtl(glyphUnicode);
textChunk.str.push(glyphUnicode);
}
textChunk.str.push(glyphUnicode);

if (charSpacing) {
if (!font.vertical) {
Expand All @@ -2679,6 +2731,7 @@ class PartialEvaluator {
}

function appendEOL() {
resetLastChars();
if (textContentItem.initialized) {
textContentItem.hasEOL = true;
flushTextContentItem();
Expand All @@ -2701,6 +2754,7 @@ class PartialEvaluator {
width <= textOrientation * textContentItem.spaceInFlowMax
) {
if (textContentItem.initialized) {
resetLastChars();
textContentItem.str.push(" ");
}
return false;
Expand All @@ -2715,6 +2769,7 @@ class PartialEvaluator {
}

flushTextContentItem();
resetLastChars();
textContent.items.push({
str: " ",
// TODO: check if using the orientation from last chunk is
Expand Down
1 change: 1 addition & 0 deletions test/pdfs/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -515,3 +515,4 @@
!issue14497.pdf
!issue14502.pdf
!issue13211.pdf
!issue14627.pdf
Binary file added test/pdfs/issue14627.pdf
Binary file not shown.
6 changes: 6 additions & 0 deletions test/test_manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -6329,5 +6329,11 @@
"md5": "d193853e8a123dc50eeea593a4150b60",
"rounds": 1,
"type": "eq"
},
{ "id": "issue14627",
"file": "pdfs/issue14627.pdf",
"md5": "5d1bfcc3b3130bfa7e33e43990e2213a",
"rounds": 1,
"type": "text"
}
]
2 changes: 1 addition & 1 deletion test/unit/api_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -1999,7 +1999,7 @@ describe("api", function () {
const data = await Promise.all([defaultPromise, parametersPromise]);

expect(!!data[0].items).toEqual(true);
expect(data[0].items.length).toEqual(11);
expect(data[0].items.length).toEqual(15);
expect(!!data[0].styles).toEqual(true);

const page1 = mergeText(data[0].items);
Expand Down
16 changes: 8 additions & 8 deletions test/unit/pdf_find_controller_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -579,14 +579,14 @@ describe("pdf_find_controller", function () {
},
pageMatches: [
[
299, 337, 414, 476, 623, 797, 978, 984, 1010, 1058, 1079, 1144, 1152,
1274, 1343, 1391, 1399, 1421, 1497, 1521, 1527, 1684, 1774, 1786,
1857, 1879, 1909, 1946, 2064, 2074, 2161, 2178, 2213, 2227, 2272,
2322, 2359, 2401, 2412, 2423, 2462, 2532, 2538, 2553, 2562, 2576,
2602, 2613, 2638, 2668, 2792, 2805, 2836, 2848, 2859, 2896, 2902,
2916, 2940, 2960, 3091, 3239, 3249, 3339, 3387, 3394, 3468, 3477,
3485, 3502, 3690, 3696, 3711, 3758, 3789, 3865, 3977, 4052, 4058,
4071,
302, 340, 418, 480, 627, 801, 982, 988, 1014, 1062, 1083, 1148, 1156,
1277, 1345, 1393, 1401, 1423, 1499, 1523, 1529, 1685, 1775, 1787,
1858, 1880, 1910, 1947, 2065, 2075, 2162, 2179, 2214, 2228, 2273,
2323, 2359, 2401, 2412, 2423, 2462, 2531, 2537, 2552, 2561, 2575,
2601, 2612, 2637, 2667, 2791, 2804, 2835, 2846, 2857, 2894, 2900,
2914, 2938, 2958, 3088, 3235, 3245, 3335, 3383, 3390, 3464, 3473,
3481, 3498, 3686, 3692, 3707, 3754, 3785, 3861, 3973, 4048, 4054,
4067,
],
],
pageMatchesLength: [
Expand Down

0 comments on commit 8b4b920

Please sign in to comment.