Skip to content

Commit

Permalink
Added the ability to generate outlines for PDFs
Browse files Browse the repository at this point in the history
--outline-tags allows specifying the HTML tags which should be
considered for the outline. The tags are expected to be given in
order of hierarchy; for example, 'h1,h2' will trigger a generation
with h1 elements as top-level outline entries and h2 as their
children.

Ideally this would not be required if Chromium would add
this directly. So if these bugs are closed this can probably be
removed again:
- https://bugs.chromium.org/p/chromium/issues/detail?id=840455
- puppeteer/puppeteer#1778

This code is heavily based on @Hopding's comment at:
Hopding/pdf-lib#127 (comment)
  • Loading branch information
Erik Schilling committed Aug 1, 2019
1 parent 2e995af commit df0d10b
Show file tree
Hide file tree
Showing 4 changed files with 154 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ pagedjs-cli ./path/to/index.html -o result.pdf
-ho, --hypher_only [str] Only hyphenate passed elements selector, such as ".hyphenate, aside"
-e, --encoding [type] Set the encoding of the input html, defaults to "utf-8"
-t, --timeout [ms] Set a max timeout of [ms]
--outline-tags [tags] Specifies that an outline should be generated for the resulting PDF document. [tags] specifies which HTML tags should be considered for that outline. "h1,h2" will trigger an outline with "h1" tags as root elements and "h2" elements as their children.
```

## Hyphenation
Expand Down
6 changes: 6 additions & 0 deletions bin/paged
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ program
.option('-t, --timeout [ms]', 'Set a max timeout of [ms]')
.option('-x, --html', 'output html file')
.option('-b, --blockLocal', 'Disallow access to filesystem for local files')
.option('--outline-tags [tags]', 'Specifies that an outline should be ' +
'generated for the resulting PDF document. [tags] specifies which ' +
'HTML tags should be considered for that outline. ' +
'"h1,h2" will trigger an outline with "h1" tags as root elements ' +
'and "h2" elements as their childs.')
.parse(process.argv);


Expand Down Expand Up @@ -122,6 +127,7 @@ if (typeof input === "string") {
file = await printer.html(input, options);
output = replaceExt(output, '.html');
} else {
options.outlineTags = !program.outlineTags ? [] : program.outlineTags.split(',');
file = await printer.pdf(input, options);
}
} else {
Expand Down
94 changes: 94 additions & 0 deletions src/postprocesser.js
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,100 @@ class PostProcesser extends EventEmitter {
console.log(page);
}

/**
* Adds a table of content to the generated PDF
*
* Ideally this would not be required if Chromium would add this directly.
* So if these bugs are closed this can probably be removed again:
* - https://bugs.chromium.org/p/chromium/issues/detail?id=840455
* - https://github.com/GoogleChrome/puppeteer/issues/1778
*
* This code is heavily based on @Hopding's comment at:
* https://github.com/Hopding/pdf-lib/issues/127#issuecomment-502450179
*/
addOutline(outlineSpec) {
const outline = JSON.parse(JSON.stringify(outlineSpec))


const pageRefs = [];
this.pdfDoc.catalog.Pages.traverse((kid, ref) => {
if (kid instanceof PDFLib.PDFPage)
pageRefs.push(ref);
});
const index = this.pdfDoc.index;

const outlineReference = index.nextObjectNumber();

const countOutlineLayer = (layer) => {
let count = 0;
for (const outlineEntry of layer) {
++count;
count += countOutlineLayer(outlineEntry.children);
}
return count;
}

const createItemsForOutlineLayer = (layer, parent) => {
layer.forEach((outlineItem, i) => {
let prev = i > 0 ? layer[i - 1].ref : null;
let next = i < layer.length - 1 ? layer[i + 1].ref : null;
const pdfItem = createOutlineItem(outlineItem, prev, next, parent);
index.assign(outlineItem.ref, pdfItem);
});
}

const createOutlineItem = (outlineItem, prev, next, parent) => {
if (!outlineItem.id) {
throw new Error(`Cannot generate outline item with title '${outlineItem.title} ` +
`without any target anchor. Please specify an 'id' attribute for ` +
`the relevant HTML element`);
}
const item = {
Title: PDFLib.PDFString.fromString(outlineItem.title),
Parent: parent,
Dest: PDFLib.PDFName.from(outlineItem.id),
};
if (prev) {
item.Prev = prev;
}
if (next) {
item.Next = next;
}
if (outlineItem.children.length > 0) {
item.First = outlineItem.children[0].ref;
item.Last = outlineItem.children[outlineItem.children.length - 1].ref;
item.Count = PDFLib.PDFNumber.fromNumber(countOutlineLayer(outlineItem.children));
createItemsForOutlineLayer(outlineItem.children, outlineItem.ref);
}

return PDFLib.PDFDictionary.from(item, index);
};

const createOutlineReferences = (outlineEntry) => {
outlineEntry.ref = index.nextObjectNumber();
for (const child of outlineEntry.children) {
createOutlineReferences(child);
}
}

for (const outlineItem of outline) {
createOutlineReferences(outlineItem);
}

createItemsForOutlineLayer(outline, outlineReference);

const pdfOutline = PDFLib.PDFDictionary.from(
{
First: outline[0].ref,
Last: outline[outline.length - 1].ref,
Count: PDFLib.PDFNumber.fromNumber(countOutlineLayer(outline)),
},
index,
);
index.assign(outlineReference, pdfOutline);
this.pdfDoc.catalog.set('Outlines', outlineReference);
}

save() {
let writer = new PDFDocumentWriter();
const pdfBytes = writer.saveToBytesWithXRefTable(this.pdfDoc);
Expand Down
53 changes: 53 additions & 0 deletions src/printer.js
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,54 @@ class Printer extends EventEmitter {
return page;
}

async _parseOutline(page, tags) {
return await page.evaluate((tags) => {
const tagsToProcess = [];
for (const node of document.querySelectorAll(tags.join(','))) {
tagsToProcess.push(node);
}
tagsToProcess.reverse();

const root = {children: [], depth: -1};
let currentOutlineNode = root;

while (tagsToProcess.length > 0) {
const tag = tagsToProcess.pop();
const orderDepth = tags.indexOf(tag.tagName.toLowerCase());

if (orderDepth < currentOutlineNode.depth) {
currentOutlineNode = currentOutlineNode.parent;
tagsToProcess.push(tag);
} else {
const newNode = {
title: tag.innerText,
id: tag.id,
children: [],
depth: orderDepth,
};
if (orderDepth == currentOutlineNode.depth) {
newNode.parent = currentOutlineNode.parent;
currentOutlineNode.parent.children.push(newNode);
currentOutlineNode = newNode;
} else if (orderDepth > currentOutlineNode.depth) {
newNode.parent = currentOutlineNode;
currentOutlineNode.children.push(newNode);
currentOutlineNode = newNode;
}
}
}

const stripParentProperty = (node) => {
node.parent = undefined;
for (const child of node.children) {
stripParentProperty(child);
}
}
stripParentProperty(root)
return root.children;
}, tags);
}

async pdf(input, options={}) {
let page = await this.render(input);

Expand All @@ -201,6 +249,8 @@ class Printer extends EventEmitter {
return meta;
});

const outline = options.outlineTags.length > 0 ? await this._parseOutline(page, options.outlineTags) : null;

let settings = {
printBackground: true,
displayHeaderFooter: false,
Expand Down Expand Up @@ -228,6 +278,9 @@ class Printer extends EventEmitter {
let post = new PostProcesser(pdf);
post.metadata(meta);
post.boxes(this.pages);
if (outline) {
post.addOutline(outline);
}
pdf = post.save();

return pdf;
Expand Down

0 comments on commit df0d10b

Please sign in to comment.