From df0d10bd1bb12d1e2077323e1fb5eec75da35a1e Mon Sep 17 00:00:00 2001 From: Erik Schilling Date: Thu, 1 Aug 2019 10:29:04 +0200 Subject: [PATCH] Added the ability to generate outlines for PDFs --outline-tags allows specifying the HTML tags which should be considered for the outline. The tags are expected to be given in order of hierarchy, for example, 'h1,h2' will trigger a generation with h1 elements as top-level outline entries and h2 as their children. Ideally this would not be required if Chromium would add this directly. So if these bugs are closed this can probably be removed again: - https://bugs.chromium.org/p/chromium/issues/detail?id=840455 - https://github.com/GoogleChrome/puppeteer/issues/1778 This code is heavily based on @Hopding's comment at: https://github.com/Hopding/pdf-lib/issues/127#issuecomment-502450179 --- README.md | 1 + bin/paged | 6 +++ src/postprocesser.js | 94 ++++++++++++++++++++++++++++++++++++++++++++ src/printer.js | 53 +++++++++++++++++++++++++ 4 files changed, 154 insertions(+) diff --git a/README.md b/README.md index 08b80ba..ea3dfd9 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ pagedjs-cli ./path/to/index.html -o result.pdf -ho, --hypher_only [str] Only hyphenate passed elements selector, such as ".hyphenate, aside" -e, --encoding [type] Set the encoding of the input html, defaults to "utf-8" -t, --timeout [ms] Set a max timeout of [ms] +--outline-tags [tags] Specifies that an outline should be generated for the resulting PDF document. [tags] specifies which HTML tags should be considered for that outline. "h1,h2" will trigger an outline with "h1" tags as root elements and "h2" elements as their childs. 
``` ## Hyphenation diff --git a/bin/paged b/bin/paged index 71a10a2..707ae1f 100755 --- a/bin/paged +++ b/bin/paged @@ -26,6 +26,11 @@ program .option('-t, --timeout [ms]', 'Set a max timeout of [ms]') .option('-x, --html', 'output html file') .option('-b, --blockLocal', 'Disallow access to filesystem for local files') + .option('--outline-tags [tags]', 'Specifies that an outline should be ' + + 'generated for the resulting PDF document. [tags] specifies which ' + + 'HTML tags should be considered for that outline. ' + + '"h1,h2" will trigger an outline with "h1" tags as root elements ' + + 'and "h2" elements as their childs.') .parse(process.argv); @@ -122,6 +127,7 @@ if (typeof input === "string") { file = await printer.html(input, options); output = replaceExt(output, '.html'); } else { + options.outlineTags = !program.outlineTags ? [] : program.outlineTags.split(','); file = await printer.pdf(input, options); } } else { diff --git a/src/postprocesser.js b/src/postprocesser.js index 635f277..a52886b 100644 --- a/src/postprocesser.js +++ b/src/postprocesser.js @@ -203,6 +203,100 @@ class PostProcesser extends EventEmitter { console.log(page); } + /** + * Adds a table of content to the generated PDF + * + * Ideally this would not be required if Chromium would add this directly. 
+ * So if these bugs are closed this can probably be removed again: + * - https://bugs.chromium.org/p/chromium/issues/detail?id=840455 + * - https://github.com/GoogleChrome/puppeteer/issues/1778 + * + * This code is heavily based on @Hopding's comment at: + * https://github.com/Hopding/pdf-lib/issues/127#issuecomment-502450179 + */ + addOutline(outlineSpec) { + const outline = JSON.parse(JSON.stringify(outlineSpec)) + + + const pageRefs = []; + this.pdfDoc.catalog.Pages.traverse((kid, ref) => { + if (kid instanceof PDFLib.PDFPage) + pageRefs.push(ref); + }); + const index = this.pdfDoc.index; + + const outlineReference = index.nextObjectNumber(); + + const countOutlineLayer = (layer) => { + let count = 0; + for (const outlineEntry of layer) { + ++count; + count += countOutlineLayer(outlineEntry.children); + } + return count; + } + + const createItemsForOutlineLayer = (layer, parent) => { + layer.forEach((outlineItem, i) => { + let prev = i > 0 ? layer[i - 1].ref : null; + let next = i < layer.length - 1 ? layer[i + 1].ref : null; + const pdfItem = createOutlineItem(outlineItem, prev, next, parent); + index.assign(outlineItem.ref, pdfItem); + }); + } + + const createOutlineItem = (outlineItem, prev, next, parent) => { + if (!outlineItem.id) { + throw new Error(`Cannot generate outline item with title '${outlineItem.title} ` + + `without any target anchor. 
Please specify an 'id' attribute for ` + + `the relevant HTML element`); + } + const item = { + Title: PDFLib.PDFString.fromString(outlineItem.title), + Parent: parent, + Dest: PDFLib.PDFName.from(outlineItem.id), + }; + if (prev) { + item.Prev = prev; + } + if (next) { + item.Next = next; + } + if (outlineItem.children.length > 0) { + item.First = outlineItem.children[0].ref; + item.Last = outlineItem.children[outlineItem.children.length - 1].ref; + item.Count = PDFLib.PDFNumber.fromNumber(countOutlineLayer(outlineItem.children)); + createItemsForOutlineLayer(outlineItem.children, outlineItem.ref); + } + + return PDFLib.PDFDictionary.from(item, index); + }; + + const createOutlineReferences = (outlineEntry) => { + outlineEntry.ref = index.nextObjectNumber(); + for (const child of outlineEntry.children) { + createOutlineReferences(child); + } + } + + for (const outlineItem of outline) { + createOutlineReferences(outlineItem); + } + + createItemsForOutlineLayer(outline, outlineReference); + + const pdfOutline = PDFLib.PDFDictionary.from( + { + First: outline[0].ref, + Last: outline[outline.length - 1].ref, + Count: PDFLib.PDFNumber.fromNumber(countOutlineLayer(outline)), + }, + index, + ); + index.assign(outlineReference, pdfOutline); + this.pdfDoc.catalog.set('Outlines', outlineReference); + } + save() { let writer = new PDFDocumentWriter(); const pdfBytes = writer.saveToBytesWithXRefTable(this.pdfDoc); diff --git a/src/printer.js b/src/printer.js index 21f6fe6..b8b4f72 100644 --- a/src/printer.js +++ b/src/printer.js @@ -182,6 +182,54 @@ class Printer extends EventEmitter { return page; } + async _parseOutline(page, tags) { + return await page.evaluate((tags) => { + const tagsToProcess = []; + for (const node of document.querySelectorAll(tags.join(','))) { + tagsToProcess.push(node); + } + tagsToProcess.reverse(); + + const root = {children: [], depth: -1}; + let currentOutlineNode = root; + + while (tagsToProcess.length > 0) { + const tag = tagsToProcess.pop(); + 
const orderDepth = tags.indexOf(tag.tagName.toLowerCase()); + + if (orderDepth < currentOutlineNode.depth) { + currentOutlineNode = currentOutlineNode.parent; + tagsToProcess.push(tag); + } else { + const newNode = { + title: tag.innerText, + id: tag.id, + children: [], + depth: orderDepth, + }; + if (orderDepth == currentOutlineNode.depth) { + newNode.parent = currentOutlineNode.parent; + currentOutlineNode.parent.children.push(newNode); + currentOutlineNode = newNode; + } else if (orderDepth > currentOutlineNode.depth) { + newNode.parent = currentOutlineNode; + currentOutlineNode.children.push(newNode); + currentOutlineNode = newNode; + } + } + } + + const stripParentProperty = (node) => { + node.parent = undefined; + for (const child of node.children) { + stripParentProperty(child); + } + } + stripParentProperty(root) + return root.children; + }, tags); + } + async pdf(input, options={}) { let page = await this.render(input); @@ -201,6 +249,8 @@ class Printer extends EventEmitter { return meta; }); + const outline = options.outlineTags.length > 0 ? await this._parseOutline(page, options.outlineTags) : null; + let settings = { printBackground: true, displayHeaderFooter: false, @@ -228,6 +278,9 @@ class Printer extends EventEmitter { let post = new PostProcesser(pdf); post.metadata(meta); post.boxes(this.pages); + if (outline) { + post.addOutline(outline); + } pdf = post.save(); return pdf;