From 85da492945ab495227240c42492afa8afe496738 Mon Sep 17 00:00:00 2001 From: koy Date: Thu, 27 Jun 2024 22:18:03 +0800 Subject: [PATCH 1/4] fix(search): clean markdown elements in search contents --- src/plugins/search/markdown-to-txt.js | 61 +++++++++++++++++++++++++++ src/plugins/search/search.js | 27 +++++++----- test/e2e/search.test.js | 19 +++++++++ 3 files changed, 97 insertions(+), 10 deletions(-) create mode 100644 src/plugins/search/markdown-to-txt.js diff --git a/src/plugins/search/markdown-to-txt.js b/src/plugins/search/markdown-to-txt.js new file mode 100644 index 000000000..dd3ee6d87 --- /dev/null +++ b/src/plugins/search/markdown-to-txt.js @@ -0,0 +1,61 @@ +/** + * This is a modified version of the + * [markdown-to-txt](https://www.npmjs.com/package/markdown-to-txt) library. + */ +import { marked } from 'marked'; +import { escape, unescape } from 'lodash'; +const block = text => text + '\n\n'; +const escapeBlock = text => escape(text) + '\n\n'; +const line = text => text + '\n'; +const inline = text => text; +const newline = () => '\n'; +const empty = () => ''; + +const TxtRenderer = { + // Block elements + code: escapeBlock, + blockquote: block, + html: empty, + heading: block, + hr: newline, + list: text => block(text.trim()), + listitem: line, + checkbox: empty, + paragraph: block, + table: (header, body) => line(header + body), + tablerow: text => line(text.trim()), + tablecell: text => text + ' ', + // Inline elements + strong: inline, + em: inline, + codespan: inline, + br: newline, + del: inline, + link: (_0, _1, text) => text, + image: (_0, _1, text) => text, + text: inline, + // etc. + options: {}, +}; + +/** + * Converts markdown to plaintext using the marked Markdown library. + * Accepts [MarkedOptions](https://marked.js.org/using_advanced#options) as + * the second argument. + * + * NOTE: The output of markdownToTxt is NOT sanitized. The output may contain + * valid HTML, JavaScript, etc. Be sure to sanitize if the output is intended + * for web use. + * + * @param markdown the markdown text to txtify + * @param options the marked options + * @returns the unmarked text + */ +export function markdownToTxt(markdown, options) { + const unmarked = marked(markdown, { ...options, renderer: TxtRenderer }); + const unescaped = unescape(unmarked); + const trimmed = unescaped.trim(); + return trimmed; +} + +export default markdownToTxt; diff --git a/src/plugins/search/search.js b/src/plugins/search/search.js index 95aa83a8b..671fb9b3d 100644 --- a/src/plugins/search/search.js +++ b/src/plugins/search/search.js @@ -2,6 +2,7 @@ import { getAndRemoveConfig, getAndRemoveDocisfyIgnoreConfig, } from '../../core/render/utils.js'; +import { markdownToTxt } from './markdown-to-txt.js'; let INDEXS = {}; @@ -34,6 +35,17 @@ function escapeHtml(string) { return String(string).replace(/[&<>"']/g, s => entityMap[s]); } +function formatContent(text) { + return escapeHtml(cleanMarkdown(ignoreDiacriticalMarks(text))); +} + +function cleanMarkdown(text) { + if (text) { + text = markdownToTxt(text); + } + return text; +} + function getAllPaths(router) { const paths = []; @@ -175,19 +187,14 @@ export function search(query) { keywords.forEach(keyword => { // From https://github.com/sindresorhus/escape-string-regexp const regEx = new RegExp( - escapeHtml(ignoreDiacriticalMarks(keyword)).replace( - /[|\\{}()[\]^$+*?.]/g, - '\\$&', - ), + formatContent(keyword).replace(/[|\\{}()[\]^$+*?.]/g, '\\$&'), 'gi', ); let indexTitle = -1; let indexContent = -1; - handlePostTitle = postTitle - ? escapeHtml(ignoreDiacriticalMarks(postTitle)) - : postTitle; + handlePostTitle = postTitle ? formatContent(postTitle) : postTitle; handlePostContent = postContent - ? escapeHtml(ignoreDiacriticalMarks(postContent)) + ? formatContent(postContent) : postContent; indexTitle = postTitle ? handlePostTitle.search(regEx) : -1; @@ -226,8 +233,8 @@ export function search(query) { if (matchesScore > 0) { const matchingPost = { - title: handlePostTitle, - content: postContent ? resultStr : '', + title: formatContent(handlePostTitle), + content: formatContent(postContent ? resultStr : ''), url: postUrl, score: matchesScore, }; diff --git a/test/e2e/search.test.js b/test/e2e/search.test.js index ecf38ec2e..dac99396f 100644 --- a/test/e2e/search.test.js +++ b/test/e2e/search.test.js @@ -232,4 +232,23 @@ test.describe('Search Plugin Tests', () => { await page.keyboard.press('z'); await expect(searchFieldElm).toBeFocused(); }); + test('search result should remove markdown', async ({ page }) => { + const docsifyInitConfig = { + markdown: { + homepage: ` + # The [mock](example.com) link + There is lots of words. + `, + }, + scriptURLs: ['/dist/plugins/search.js'], + }; + + const searchFieldElm = page.locator('input[type=search]'); + const resultsHeadingElm = page.locator('.results-panel h2'); + + await docsifyInit(docsifyInitConfig); + + await searchFieldElm.fill('There'); + await expect(resultsHeadingElm).toHaveText('The mock link'); + }); }); From 144b4f809a2c18aee86cb3d3ee86a64cda42577f Mon Sep 17 00:00:00 2001 From: koy Date: Thu, 1 Aug 2024 21:46:46 +0800 Subject: [PATCH 2/4] update: refactor --- src/plugins/search/component.js | 2 +- src/plugins/search/markdown-to-txt.js | 240 ++++++++++++++++++++------ src/plugins/search/search.js | 32 ++-- test/e2e/search.test.js | 94 +++++++++- 4 files changed, 290 insertions(+), 78 deletions(-) diff --git a/src/plugins/search/component.js b/src/plugins/search/component.js index 1fb4f36f2..8496454a5 100644 --- a/src/plugins/search/component.js +++ b/src/plugins/search/component.js @@ -53,7 +53,7 @@ function doSearch(value) {

${post.title}

-

${post.content}

+

...${post.content}...

`; diff --git a/src/plugins/search/markdown-to-txt.js b/src/plugins/search/markdown-to-txt.js index dd3ee6d87..90e042be6 100644 --- a/src/plugins/search/markdown-to-txt.js +++ b/src/plugins/search/markdown-to-txt.js @@ -1,61 +1,197 @@ /** - * This is a modified version of the - * [markdown-to-txt](https://www.npmjs.com/package/markdown-to-txt) library. + * This is a function to convert markdown to txt based on markedjs v13+. + * Copies the escape/unescape functions from [lodash](https://www.npmjs.com/package/lodash) instead import to reduce the size. */ import { marked } from 'marked'; -import { escape, unescape } from 'lodash'; -const block = text => text + '\n\n'; -const escapeBlock = text => escape(text) + '\n\n'; -const line = text => text + '\n'; -const inline = text => text; -const newline = () => '\n'; -const empty = () => ''; - -const TxtRenderer = { - // Block elements - code: escapeBlock, - blockquote: block, - html: empty, - heading: block, - hr: newline, - list: text => block(text.trim()), - listitem: line, - checkbox: empty, - paragraph: block, - table: (header, body) => line(header + body), - tablerow: text => line(text.trim()), - tablecell: text => text + ' ', - // Inline elements - strong: inline, - em: inline, - codespan: inline, - br: newline, - del: inline, - link: (_0, _1, text) => text, - image: (_0, _1, text) => text, - text: inline, - // etc. - options: {}, + +const reEscapedHtml = /&(?:amp|lt|gt|quot|#(0+)?39);/g; +const reHasEscapedHtml = RegExp(reEscapedHtml.source); +const htmlUnescapes = { + '&': '&', + '<': '<', + '>': '>', + '"': '"', + ''': "'", }; -/** - * Converts markdown to plaintext using the marked Markdown library. - * Accepts [MarkedOptions](https://marked.js.org/using_advanced#options) as - * the second argument. - * - * NOTE: The output of markdownToTxt is NOT sanitized. The output may contain - * valid HTML, JavaScript, etc. Be sure to sanitize if the output is intended - * for web use. - * - * @param markdown the markdown text to txtify - * @param options the marked options - * @returns the unmarked text - */ -export function markdownToTxt(markdown, options) { - const unmarked = marked(markdown, { ...options, renderer: TxtRenderer }); +function unescape(string) { + return string && reHasEscapedHtml.test(string) + ? string.replace(reEscapedHtml, entity => htmlUnescapes[entity] || "'") + : string || ''; +} + +const reUnescapedHtml = /[&<>"']/g; +const reHasUnescapedHtml = RegExp(reUnescapedHtml.source); +const htmlEscapes = { + '&': '&', + '<': '<', + '>': '>', + '"': '"', + "'": ''', +}; + +function escape(string) { + return string && reHasUnescapedHtml.test(string) + ? string.replace(reUnescapedHtml, chr => htmlEscapes[chr]) + : string || ''; +} + +function helpersCleanup(string) { + return string && string.replace('!>', '').replace('?>', ''); +} + +const markdownToTxtRenderer = { + space() { + return ''; + }, + + code({ text }) { + const code = text.replace(/\n$/, ''); + return escape(code); + }, + + blockquote({ tokens }) { + return this.parser?.parse(tokens) || ''; + }, + + html() { + return ''; + }, + + heading({ tokens }) { + return this.parser?.parse(tokens) || ''; + }, + + hr() { + return ''; + }, + + list(token) { + let body = ''; + for (let j = 0; j < token.items.length; j++) { + const item = token.items[j]; + body += this.listitem?.(item); + } + + return body; + }, + + listitem(item) { + let itemBody = ''; + if (item.task) { + const checkbox = this.checkbox?.({ checked: !!item.checked }); + if (item.loose) { + if (item.tokens.length > 0 && item.tokens[0].type === 'paragraph') { + item.tokens[0].text = checkbox + ' ' + item.tokens[0].text; + if ( + item.tokens[0].tokens && + item.tokens[0].tokens.length > 0 && + item.tokens[0].tokens[0].type === 'text' + ) { + item.tokens[0].tokens[0].text = + checkbox + ' ' + item.tokens[0].tokens[0].text; + } + } else { + item.tokens.unshift({ + type: 'text', + raw: checkbox + ' ', + text: checkbox + ' ', + }); + } + } else { + itemBody += checkbox + ' '; + } + } + + itemBody += this.parser?.parse(item.tokens, !!item.loose); + + return `${itemBody || ''}`; + }, + + checkbox() { + return ''; + }, + + paragraph({ tokens }) { + return this.parser?.parseInline(tokens) || ''; + }, + + table(token) { + let header = ''; + + let cell = ''; + for (let j = 0; j < token.header.length; j++) { + cell += this.tablecell?.(token.header[j]); + } + header += this.tablerow?.({ text: cell }); + + let body = ''; + for (let j = 0; j < token.rows.length; j++) { + const row = token.rows[j]; + + cell = ''; + for (let k = 0; k < row.length; k++) { + cell += this.tablecell?.(row[k]); + } + + body += this.tablerow?.({ text: cell }); + } + + return header + ' ' + body; + }, + + tablerow({ text }) { + return text; + }, + + tablecell(token) { + return this.parser?.parseInline(token.tokens) || ''; + }, + + strong({ text }) { + return text; + }, + + em({ tokens }) { + return this.parser?.parseInline(tokens) || ''; + }, + + codespan({ text }) { + return text; + }, + + br() { + return ' '; + }, + + del({ tokens }) { + return this.parser?.parseInline(tokens); + }, + + link({ tokens, href, title }) { + // Remain the href and title attributes for searching, so is the image + // e.g. [filename](_media/example.js ':include :type=code :fragment=demo') + // Result: filename _media/example.js :include :type=code :fragment=demo + return `${this.parser?.parseInline(tokens) || ''} ${href || ''} ${title || ''}`; + }, + + image({ title, text, href }) { + return `${text || ''} ${href || ''} ${title || ''}`; + }, + + text(token) { + return token.tokens + ? this.parser?.parseInline(token.tokens) || '' + : token.text || ''; + }, +}; +const _marked = marked.setOptions({ renderer: markdownToTxtRenderer }); + +export function markdownToTxt(markdown) { + const unmarked = _marked.parse(markdown); const unescaped = unescape(unmarked); - const trimmed = unescaped.trim(); - return trimmed; + const helpersCleaned = helpersCleanup(unescaped); + return helpersCleaned.trim(); } export default markdownToTxt; diff --git a/src/plugins/search/search.js b/src/plugins/search/search.js index 8bd3fa805..487718bff 100644 --- a/src/plugins/search/search.js +++ b/src/plugins/search/search.js @@ -60,17 +60,6 @@ function escapeHtml(string) { return String(string).replace(/[&<>"']/g, s => entityMap[s]); } -function formatContent(text) { - return escapeHtml(cleanMarkdown(ignoreDiacriticalMarks(text))); -} - -function cleanMarkdown(text) { - if (text) { - text = markdownToTxt(text); - } - return text; -} - function getAllPaths(router) { const paths = []; @@ -146,7 +135,7 @@ export function genIndex(path, content = '', router, depth, indexKey) { index[slug] = { slug, title: path !== '/' ? path.slice(1) : 'Home Page', - body: token.text || '', + body: markdownToTxt(token.text || ''), path: path, indexKey: indexKey, }; @@ -162,12 +151,12 @@ export function genIndex(path, content = '', router, depth, indexKey) { token.text = getTableData(token); token.text = getListData(token); - index[slug].body += '\n' + (token.text || ''); + index[slug].body += '\n' + markdownToTxt(token.text || ''); } else { token.text = getTableData(token); token.text = getListData(token); - index[slug].body = token.text || ''; + index[slug].body = markdownToTxt(token.text || ''); } index[slug].path = path; @@ -211,14 +200,19 @@ export function search(query) { keywords.forEach(keyword => { // From https://github.com/sindresorhus/escape-string-regexp const regEx = new RegExp( - formatContent(keyword).replace(/[|\\{}()[\]^$+*?.]/g, '\\$&'), + escapeHtml(ignoreDiacriticalMarks(keyword)).replace( + /[|\\{}()[\]^$+*?.]/g, + '\\$&', + ), 'gi', ); let indexTitle = -1; let indexContent = -1; - handlePostTitle = postTitle ? formatContent(postTitle) : postTitle; + handlePostTitle = postTitle + ? escapeHtml(ignoreDiacriticalMarks(postTitle)) + : postTitle; handlePostContent = postContent - ? formatContent(postContent) + ? escapeHtml(ignoreDiacriticalMarks(postContent)) : postContent; indexTitle = postTitle ? handlePostTitle.search(regEx) : -1; @@ -252,8 +246,8 @@ export function search(query) { if (matchesScore > 0) { const matchingPost = { - title: formatContent(handlePostTitle), - content: formatContent(postContent ? resultStr : ''), + title: handlePostTitle, + content: postContent ? resultStr : '', url: postUrl, score: matchesScore, }; diff --git a/test/e2e/search.test.js b/test/e2e/search.test.js index ed73e6c2a..8fc04adf6 100644 --- a/test/e2e/search.test.js +++ b/test/e2e/search.test.js @@ -232,23 +232,105 @@ test.describe('Search Plugin Tests', () => { await page.keyboard.press('z'); await expect(searchFieldElm).toBeFocused(); }); - test('search result should remove markdown', async ({ page }) => { + test('search result should remove markdown code block', async ({ page }) => { const docsifyInitConfig = { markdown: { homepage: ` - # The [mock](example.com) link - There is lots of words. +# Hello World + +searchHere +\`\`\`js +console.log('Hello World'); +\`\`\` `, }, scriptURLs: ['/dist/plugins/search.js'], }; const searchFieldElm = page.locator('input[type=search]'); - const resultsHeadingElm = page.locator('.results-panel h2'); + const resultsHeadingElm = page.locator('.results-panel .content'); await docsifyInit(docsifyInitConfig); + await searchFieldElm.fill('searchHere'); + // there is a newline after searchHere and the markdown part ```js ``` it should be removed + expect(await resultsHeadingElm.textContent()).toContain( + "...searchHere\nconsole.log('Hello Worl...", + ); + }); - await searchFieldElm.fill('There'); - await expect(resultsHeadingElm).toHaveText('The mock link'); + test('search result should remove file markdown and keep href attribution for files', async ({ + page, + }) => { + const docsifyInitConfig = { + markdown: { + homepage: ` +# Hello World +![filename](_media/example.js ':include :type=code :fragment=demo') + `, + }, + scriptURLs: ['/dist/plugins/search.js'], + }; + + const searchFieldElm = page.locator('input[type=search]'); + const resultsHeadingElm = page.locator('.results-panel .content'); + + await docsifyInit(docsifyInitConfig); + await searchFieldElm.fill('filename'); + expect(await resultsHeadingElm.textContent()).toContain( + '...filename _media/example.js :include :type=code :fragment=demo...', + ); + }); + + test('search result should remove checkbox markdown and keep related values', async ({ + page, + }) => { + const docsifyInitConfig = { + markdown: { + homepage: ` +# Hello World + +- [ ] Task 1 +- [x] SearchHere +- [ ] Task 3 + `, + }, + scriptURLs: ['/dist/plugins/search.js'], + }; + + const searchFieldElm = page.locator('input[type=search]'); + const resultsHeadingElm = page.locator('.results-panel .content'); + + await docsifyInit(docsifyInitConfig); + await searchFieldElm.fill('SearchHere'); + // remove the checkbox markdown and keep the related values + expect(await resultsHeadingElm.textContent()).toContain( + '...Task 1 SearchHere Task 3...', + ); + }); + + test('search result should remove docsify self helper markdown and keep related values', async ({ + page, + }) => { + const docsifyInitConfig = { + markdown: { + homepage: ` +# Hello World + +!> SearchHere to check it! + + `, + }, + scriptURLs: ['/dist/plugins/search.js'], + }; + + const searchFieldElm = page.locator('input[type=search]'); + const resultsHeadingElm = page.locator('.results-panel .content'); + + await docsifyInit(docsifyInitConfig); + await searchFieldElm.fill('SearchHere'); + // remove the helper markdown and keep the related values + expect(await resultsHeadingElm.textContent()).toContain( + '...SearchHere to check it!...', + ); }); }); From 7fcc6bcbed94129bcbf486db2c81b7660e97d111 Mon Sep 17 00:00:00 2001 From: koy Date: Thu, 1 Aug 2024 23:56:14 +0800 Subject: [PATCH 3/4] update: only append ellipses on content not empty --- src/plugins/search/component.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/plugins/search/component.js b/src/plugins/search/component.js index 8496454a5..2bce54b97 100644 --- a/src/plugins/search/component.js +++ b/src/plugins/search/component.js @@ -49,11 +49,12 @@ function doSearch(value) { let html = ''; matches.forEach((post, i) => { + const content = post.content ? `...${post.content}...` : ''; html += /* html */ `

${post.title}

-

...${post.content}...

+

${content}

`; From 09cf22133f61d1fb6eb416bf7081e5493e19242c Mon Sep 17 00:00:00 2001 From: koy Date: Thu, 19 Sep 2024 13:32:41 +0800 Subject: [PATCH 4/4] update: fix mismatch search content --- src/plugins/search/search.js | 4 ++-- test/e2e/search.test.js | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/plugins/search/search.js b/src/plugins/search/search.js index 487718bff..d4e33e3c3 100644 --- a/src/plugins/search/search.js +++ b/src/plugins/search/search.js @@ -230,8 +230,8 @@ export function search(query) { start = indexContent < 11 ? 0 : indexContent - 10; end = start === 0 ? 100 : indexContent + keyword.length + 90; - if (postContent && end > postContent.length) { - end = postContent.length; + if (handlePostContent && end > handlePostContent.length) { + end = handlePostContent.length; } const matchContent = diff --git a/test/e2e/search.test.js b/test/e2e/search.test.js index 8fc04adf6..0d0056ae2 100644 --- a/test/e2e/search.test.js +++ b/test/e2e/search.test.js @@ -254,7 +254,7 @@ console.log('Hello World'); await searchFieldElm.fill('searchHere'); // there is a newline after searchHere and the markdown part ```js ``` it should be removed expect(await resultsHeadingElm.textContent()).toContain( - "...searchHere\nconsole.log('Hello Worl...", + "...searchHere\nconsole.log('Hello World');...", ); });