From 85da492945ab495227240c42492afa8afe496738 Mon Sep 17 00:00:00 2001
From: koy <koy@ko8e24.top>
Date: Thu, 27 Jun 2024 22:18:03 +0800
Subject: [PATCH 1/4] fix(search): clean markdown elements in search contents

---
 src/plugins/search/markdown-to-txt.js | 61 +++++++++++++++++++++++++++
 src/plugins/search/search.js          | 27 +++++++-----
 test/e2e/search.test.js               | 19 +++++++++
 3 files changed, 97 insertions(+), 10 deletions(-)
 create mode 100644 src/plugins/search/markdown-to-txt.js

diff --git a/src/plugins/search/markdown-to-txt.js b/src/plugins/search/markdown-to-txt.js
new file mode 100644
index 000000000..dd3ee6d87
--- /dev/null
+++ b/src/plugins/search/markdown-to-txt.js
@@ -0,0 +1,61 @@
+/**
+ * This is a modified version of the
+ * [markdown-to-txt](https://www.npmjs.com/package/markdown-to-txt) library.
+ */
+import { marked } from 'marked';
+import { escape, unescape } from 'lodash';
+const block = text => text + '\n\n';
+const escapeBlock = text => escape(text) + '\n\n';
+const line = text => text + '\n';
+const inline = text => text;
+const newline = () => '\n';
+const empty = () => '';
+
+const TxtRenderer = {
+  // Block elements
+  code: escapeBlock,
+  blockquote: block,
+  html: empty,
+  heading: block,
+  hr: newline,
+  list: text => block(text.trim()),
+  listitem: line,
+  checkbox: empty,
+  paragraph: block,
+  table: (header, body) => line(header + body),
+  tablerow: text => line(text.trim()),
+  tablecell: text => text + ' ',
+  // Inline elements
+  strong: inline,
+  em: inline,
+  codespan: inline,
+  br: newline,
+  del: inline,
+  link: (_0, _1, text) => text,
+  image: (_0, _1, text) => text,
+  text: inline,
+  // etc.
+  options: {},
+};
+
+/**
+ * Converts markdown to plaintext using the marked Markdown library.
+ * Accepts [MarkedOptions](https://marked.js.org/using_advanced#options) as
+ * the second argument.
+ *
+ * NOTE: The output of markdownToTxt is NOT sanitized. The output may contain
+ * valid HTML, JavaScript, etc. Be sure to sanitize if the output is intended
+ * for web use.
+ *
+ * @param markdown the markdown text to txtify
+ * @param options  the marked options
+ * @returns the unmarked text
+ */
+export function markdownToTxt(markdown, options) {
+  const unmarked = marked(markdown, { ...options, renderer: TxtRenderer });
+  const unescaped = unescape(unmarked);
+  const trimmed = unescaped.trim();
+  return trimmed;
+}
+
+export default markdownToTxt;
diff --git a/src/plugins/search/search.js b/src/plugins/search/search.js
index 95aa83a8b..671fb9b3d 100644
--- a/src/plugins/search/search.js
+++ b/src/plugins/search/search.js
@@ -2,6 +2,7 @@ import {
   getAndRemoveConfig,
   getAndRemoveDocisfyIgnoreConfig,
 } from '../../core/render/utils.js';
+import { markdownToTxt } from './markdown-to-txt.js';
 
 let INDEXS = {};
 
@@ -34,6 +35,17 @@ function escapeHtml(string) {
   return String(string).replace(/[&<>"']/g, s => entityMap[s]);
 }
 
+function formatContent(text) {
+  return escapeHtml(cleanMarkdown(ignoreDiacriticalMarks(text)));
+}
+
+function cleanMarkdown(text) {
+  if (text) {
+    text = markdownToTxt(text);
+  }
+  return text;
+}
+
 function getAllPaths(router) {
   const paths = [];
 
@@ -175,19 +187,14 @@ export function search(query) {
       keywords.forEach(keyword => {
         // From https://github.com/sindresorhus/escape-string-regexp
         const regEx = new RegExp(
-          escapeHtml(ignoreDiacriticalMarks(keyword)).replace(
-            /[|\\{}()[\]^$+*?.]/g,
-            '\\$&',
-          ),
+          formatContent(keyword).replace(/[|\\{}()[\]^$+*?.]/g, '\\$&'),
           'gi',
         );
         let indexTitle = -1;
         let indexContent = -1;
-        handlePostTitle = postTitle
-          ? escapeHtml(ignoreDiacriticalMarks(postTitle))
-          : postTitle;
+        handlePostTitle = postTitle ? formatContent(postTitle) : postTitle;
         handlePostContent = postContent
-          ? escapeHtml(ignoreDiacriticalMarks(postContent))
+          ? formatContent(postContent)
           : postContent;
 
         indexTitle = postTitle ? handlePostTitle.search(regEx) : -1;
@@ -226,8 +233,8 @@ export function search(query) {
 
       if (matchesScore > 0) {
         const matchingPost = {
-          title: handlePostTitle,
-          content: postContent ? resultStr : '',
+          title: formatContent(handlePostTitle),
+          content: formatContent(postContent ? resultStr : ''),
           url: postUrl,
           score: matchesScore,
         };
diff --git a/test/e2e/search.test.js b/test/e2e/search.test.js
index ecf38ec2e..dac99396f 100644
--- a/test/e2e/search.test.js
+++ b/test/e2e/search.test.js
@@ -232,4 +232,23 @@ test.describe('Search Plugin Tests', () => {
     await page.keyboard.press('z');
     await expect(searchFieldElm).toBeFocused();
   });
+  test('search result should remove markdown', async ({ page }) => {
+    const docsifyInitConfig = {
+      markdown: {
+        homepage: `
+          # The [mock](example.com) link
+          There is lots of words.
+        `,
+      },
+      scriptURLs: ['/dist/plugins/search.js'],
+    };
+
+    const searchFieldElm = page.locator('input[type=search]');
+    const resultsHeadingElm = page.locator('.results-panel h2');
+
+    await docsifyInit(docsifyInitConfig);
+
+    await searchFieldElm.fill('There');
+    await expect(resultsHeadingElm).toHaveText('The mock link');
+  });
 });

From 144b4f809a2c18aee86cb3d3ee86a64cda42577f Mon Sep 17 00:00:00 2001
From: koy <koy@ko8e24.top>
Date: Thu, 1 Aug 2024 21:46:46 +0800
Subject: [PATCH 2/4] update: refactor

---
 src/plugins/search/component.js       |   2 +-
 src/plugins/search/markdown-to-txt.js | 240 ++++++++++++++++++++------
 src/plugins/search/search.js          |  32 ++--
 test/e2e/search.test.js               |  94 +++++++++-
 4 files changed, 290 insertions(+), 78 deletions(-)

diff --git a/src/plugins/search/component.js b/src/plugins/search/component.js
index 1fb4f36f2..8496454a5 100644
--- a/src/plugins/search/component.js
+++ b/src/plugins/search/component.js
@@ -53,7 +53,7 @@ function doSearch(value) {
       <div class="matching-post" aria-label="search result ${i + 1}">
         <a href="${post.url}">
           <p class="title clamp-1">${post.title}</p>
-          <p class="content clamp-2">${post.content}</p>
+          <p class="content clamp-2">...${post.content}...</p>
         </a>
       </div>
     `;
diff --git a/src/plugins/search/markdown-to-txt.js b/src/plugins/search/markdown-to-txt.js
index dd3ee6d87..90e042be6 100644
--- a/src/plugins/search/markdown-to-txt.js
+++ b/src/plugins/search/markdown-to-txt.js
@@ -1,61 +1,197 @@
 /**
- * This is a modified version of the
- * [markdown-to-txt](https://www.npmjs.com/package/markdown-to-txt) library.
+ * Converts markdown to plain text using a custom renderer for markedjs v13+.
+ * The escape/unescape helpers are copied from [lodash](https://www.npmjs.com/package/lodash) instead of importing it, to reduce the bundle size.
  */
 import { marked } from 'marked';
-import { escape, unescape } from 'lodash';
-const block = text => text + '\n\n';
-const escapeBlock = text => escape(text) + '\n\n';
-const line = text => text + '\n';
-const inline = text => text;
-const newline = () => '\n';
-const empty = () => '';
-
-const TxtRenderer = {
-  // Block elements
-  code: escapeBlock,
-  blockquote: block,
-  html: empty,
-  heading: block,
-  hr: newline,
-  list: text => block(text.trim()),
-  listitem: line,
-  checkbox: empty,
-  paragraph: block,
-  table: (header, body) => line(header + body),
-  tablerow: text => line(text.trim()),
-  tablecell: text => text + ' ',
-  // Inline elements
-  strong: inline,
-  em: inline,
-  codespan: inline,
-  br: newline,
-  del: inline,
-  link: (_0, _1, text) => text,
-  image: (_0, _1, text) => text,
-  text: inline,
-  // etc.
-  options: {},
+
+const reEscapedHtml = /&(?:amp|lt|gt|quot|#(0+)?39);/g;
+const reHasEscapedHtml = RegExp(reEscapedHtml.source);
+const htmlUnescapes = {
+  '&amp;': '&',
+  '&lt;': '<',
+  '&gt;': '>',
+  '&quot;': '"',
+  '&#39;': "'",
 };
 
-/**
- * Converts markdown to plaintext using the marked Markdown library.
- * Accepts [MarkedOptions](https://marked.js.org/using_advanced#options) as
- * the second argument.
- *
- * NOTE: The output of markdownToTxt is NOT sanitized. The output may contain
- * valid HTML, JavaScript, etc. Be sure to sanitize if the output is intended
- * for web use.
- *
- * @param markdown the markdown text to txtify
- * @param options  the marked options
- * @returns the unmarked text
- */
-export function markdownToTxt(markdown, options) {
-  const unmarked = marked(markdown, { ...options, renderer: TxtRenderer });
+// Decode &amp; &lt; &gt; &quot; and (zero-padded) &#39; into plain characters.
+function unescape(string) {
+  const decode = entity => htmlUnescapes[entity] || "'";
+  const hasEntity = Boolean(string) && reHasEscapedHtml.test(string);
+  return hasEntity ? string.replace(reEscapedHtml, decode) : string || '';
+}
+const reUnescapedHtml = /[&<>"']/g;
+const reHasUnescapedHtml = RegExp(reUnescapedHtml.source);
+const htmlEscapes = {
+  '&': '&amp;',
+  '<': '&lt;',
+  '>': '&gt;',
+  '"': '&quot;',
+  "'": '&#39;',
+};
+// Encode the characters & < > " ' as HTML entities; falsy input yields ''.
+function escape(string) {
+  const encode = chr => htmlEscapes[chr];
+  const hasSpecial = Boolean(string) && reHasUnescapedHtml.test(string);
+  return hasSpecial ? string.replace(reUnescapedHtml, encode) : string || '';
+}
+// Strip every docsify helper marker ("!>" / "?>"); falsy input is returned as-is.
+function helpersCleanup(string) {
+  return string && string.replace(/[!?]>/g, '');
+}
+
+// Renderer overrides that make marked produce plain text instead of HTML. Each
+// method receives a token object and returns the text to keep for that token.
+const markdownToTxtRenderer = {
+  // Blank-line tokens add no text.
+  space() {
+    return '';
+  },
+
+  // Escaped on purpose: markdownToTxt() unescapes the final output, and this
+  // keeps entity-like text inside code (e.g. "&amp;") from being decoded.
+  code({ text }) {
+    const code = text.replace(/\n$/, '');
+    return escape(code);
+  },
+
+  blockquote({ tokens }) {
+    return this.parser?.parse(tokens) || '';
+  },
+
+  // Raw HTML contributes nothing to the plain-text output.
+  html() {
+    return '';
+  },
+
+  // Heading tokens hold inline tokens, so they must go through parseInline();
+  // parse() rejects inline token types such as "strong" or "link".
+  heading({ tokens }) {
+    return this.parser?.parseInline(tokens) || '';
+  },
+
+  // Horizontal rules add no text.
+  hr() {
+    return '';
+  },
+
+  // Items are trimmed and joined with one space so that the words of
+  // neighbouring items are not fused together.
+  list(token) {
+    return token.items
+      .map(item => this.listitem(item).trim())
+      .filter(Boolean)
+      .join(' ');
+  },
+
+  // checkbox() renders as '', so a task item only gains a leading space here.
+  listitem(item) {
+    let itemBody = '';
+    if (item.task) {
+      const checkbox = this.checkbox?.({ checked: !!item.checked });
+      if (item.loose) {
+        if (item.tokens.length > 0 && item.tokens[0].type === 'paragraph') {
+          item.tokens[0].text = checkbox + ' ' + item.tokens[0].text;
+          if (
+            item.tokens[0].tokens &&
+            item.tokens[0].tokens.length > 0 &&
+            item.tokens[0].tokens[0].type === 'text'
+          ) {
+            item.tokens[0].tokens[0].text =
+              checkbox + ' ' + item.tokens[0].tokens[0].text;
+          }
+        } else {
+          item.tokens.unshift({
+            type: 'text',
+            raw: checkbox + ' ',
+            text: checkbox + ' ',
+          });
+        }
+      } else {
+        itemBody += checkbox + ' ';
+      }
+    }
+
+    // `|| ''` keeps a missing parser from appending the string "undefined".
+    itemBody += this.parser?.parse(item.tokens, !!item.loose) || '';
+
+    return itemBody;
+  },
+
+  // Task-list checkboxes have no textual representation.
+  checkbox() {
+    return '';
+  },
+
+  paragraph({ tokens }) {
+    return this.parser?.parseInline(tokens) || '';
+  },
+
+  // Cells within a row, and the rows themselves, are joined with one space so
+  // that adjacent cell contents stay separate words.
+  table(token) {
+    const renderRow = row =>
+      this.tablerow({ text: row.map(cell => this.tablecell(cell)).join(' ') });
+
+    return [token.header, ...token.rows].map(renderRow).join(' ');
+  },
+
+  tablerow({ text }) {
+    return text;
+  },
+
+  tablecell(token) {
+    return this.parser?.parseInline(token.tokens) || '';
+  },
+
+  // Render the nested tokens (not the raw `text`) so that markup inside the
+  // bold span is stripped as well.
+  strong({ tokens }) {
+    return this.parser?.parseInline(tokens) || '';
+  },
+
+  em({ tokens }) {
+    return this.parser?.parseInline(tokens) || '';
+  },
+
+  codespan({ text }) {
+    return text;
+  },
+
+  // A line break becomes a space so the words on either side stay apart.
+  br() {
+    return ' ';
+  },
+
+  del({ tokens }) {
+    return this.parser?.parseInline(tokens) || '';
+  },
+
+  link({ tokens, href, title }) {
+    // Keep the href and title so they stay searchable (image() does the same),
+    // e.g. [filename](_media/example.js ':include :type=code :fragment=demo')
+    // Result: filename _media/example.js :include :type=code :fragment=demo
+    return `${this.parser?.parseInline(tokens) || ''} ${href || ''} ${title || ''}`;
+  },
+
+  image({ title, text, href }) {
+    return `${text || ''} ${href || ''} ${title || ''}`;
+  },
+
+  // Text tokens with nested inline tokens are rendered from those tokens.
+  text(token) {
+    return token.tokens
+      ? this.parser?.parseInline(token.tokens) || ''
+      : token.text || '';
+  },
+};
+const _marked = marked.setOptions({ renderer: markdownToTxtRenderer });
+
+export function markdownToTxt(markdown) {
+  const unmarked = _marked.parse(markdown);
   const unescaped = unescape(unmarked);
-  const trimmed = unescaped.trim();
-  return trimmed;
+  const helpersCleaned = helpersCleanup(unescaped);
+  return helpersCleaned.trim();
 }
 
 export default markdownToTxt;
diff --git a/src/plugins/search/search.js b/src/plugins/search/search.js
index 8bd3fa805..487718bff 100644
--- a/src/plugins/search/search.js
+++ b/src/plugins/search/search.js
@@ -60,17 +60,6 @@ function escapeHtml(string) {
   return String(string).replace(/[&<>"']/g, s => entityMap[s]);
 }
 
-function formatContent(text) {
-  return escapeHtml(cleanMarkdown(ignoreDiacriticalMarks(text)));
-}
-
-function cleanMarkdown(text) {
-  if (text) {
-    text = markdownToTxt(text);
-  }
-  return text;
-}
-
 function getAllPaths(router) {
   const paths = [];
 
@@ -146,7 +135,7 @@ export function genIndex(path, content = '', router, depth, indexKey) {
         index[slug] = {
           slug,
           title: path !== '/' ? path.slice(1) : 'Home Page',
-          body: token.text || '',
+          body: markdownToTxt(token.text || ''),
           path: path,
           indexKey: indexKey,
         };
@@ -162,12 +151,12 @@ export function genIndex(path, content = '', router, depth, indexKey) {
         token.text = getTableData(token);
         token.text = getListData(token);
 
-        index[slug].body += '\n' + (token.text || '');
+        index[slug].body += '\n' + markdownToTxt(token.text || '');
       } else {
         token.text = getTableData(token);
         token.text = getListData(token);
 
-        index[slug].body = token.text || '';
+        index[slug].body = markdownToTxt(token.text || '');
       }
 
       index[slug].path = path;
@@ -211,14 +200,19 @@ export function search(query) {
       keywords.forEach(keyword => {
         // From https://github.com/sindresorhus/escape-string-regexp
         const regEx = new RegExp(
-          formatContent(keyword).replace(/[|\\{}()[\]^$+*?.]/g, '\\$&'),
+          escapeHtml(ignoreDiacriticalMarks(keyword)).replace(
+            /[|\\{}()[\]^$+*?.]/g,
+            '\\$&',
+          ),
           'gi',
         );
         let indexTitle = -1;
         let indexContent = -1;
-        handlePostTitle = postTitle ? formatContent(postTitle) : postTitle;
+        handlePostTitle = postTitle
+          ? escapeHtml(ignoreDiacriticalMarks(postTitle))
+          : postTitle;
         handlePostContent = postContent
-          ? formatContent(postContent)
+          ? escapeHtml(ignoreDiacriticalMarks(postContent))
           : postContent;
 
         indexTitle = postTitle ? handlePostTitle.search(regEx) : -1;
@@ -252,8 +246,8 @@ export function search(query) {
 
       if (matchesScore > 0) {
         const matchingPost = {
-          title: formatContent(handlePostTitle),
-          content: formatContent(postContent ? resultStr : ''),
+          title: handlePostTitle,
+          content: postContent ? resultStr : '',
           url: postUrl,
           score: matchesScore,
         };
diff --git a/test/e2e/search.test.js b/test/e2e/search.test.js
index ed73e6c2a..8fc04adf6 100644
--- a/test/e2e/search.test.js
+++ b/test/e2e/search.test.js
@@ -232,23 +232,105 @@ test.describe('Search Plugin Tests', () => {
     await page.keyboard.press('z');
     await expect(searchFieldElm).toBeFocused();
   });
-  test('search result should remove markdown', async ({ page }) => {
+  test('search result should remove markdown code block', async ({ page }) => {
     const docsifyInitConfig = {
       markdown: {
         homepage: `
-          # The [mock](example.com) link
-          There is lots of words.
+# Hello World
+
+searchHere
+\`\`\`js
+console.log('Hello World');
+\`\`\`
         `,
       },
       scriptURLs: ['/dist/plugins/search.js'],
     };
 
     const searchFieldElm = page.locator('input[type=search]');
-    const resultsHeadingElm = page.locator('.results-panel h2');
+    const resultsHeadingElm = page.locator('.results-panel .content');
 
     await docsifyInit(docsifyInitConfig);
+    await searchFieldElm.fill('searchHere');
+    // there is a newline after searchHere and the markdown part ```js ``` it should be removed
+    expect(await resultsHeadingElm.textContent()).toContain(
+      "...searchHere\nconsole.log('Hello Worl...",
+    );
+  });
 
-    await searchFieldElm.fill('There');
-    await expect(resultsHeadingElm).toHaveText('The mock link');
+  test('search result should remove file markdown and keep href attribution for files', async ({
+    page,
+  }) => {
+    const docsifyInitConfig = {
+      markdown: {
+        homepage: `
+# Hello World
+![filename](_media/example.js ':include :type=code :fragment=demo')
+        `,
+      },
+      scriptURLs: ['/dist/plugins/search.js'],
+    };
+
+    const searchFieldElm = page.locator('input[type=search]');
+    const resultsHeadingElm = page.locator('.results-panel .content');
+
+    await docsifyInit(docsifyInitConfig);
+    await searchFieldElm.fill('filename');
+    expect(await resultsHeadingElm.textContent()).toContain(
+      '...filename _media/example.js :include :type=code :fragment=demo...',
+    );
+  });
+
+  test('search result should remove checkbox markdown and keep related values', async ({
+    page,
+  }) => {
+    const docsifyInitConfig = {
+      markdown: {
+        homepage: `
+# Hello World
+         
+- [ ] Task 1
+- [x] SearchHere
+- [ ] Task 3
+          `,
+      },
+      scriptURLs: ['/dist/plugins/search.js'],
+    };
+
+    const searchFieldElm = page.locator('input[type=search]');
+    const resultsHeadingElm = page.locator('.results-panel .content');
+
+    await docsifyInit(docsifyInitConfig);
+    await searchFieldElm.fill('SearchHere');
+    // remove the checkbox markdown and keep the related values
+    expect(await resultsHeadingElm.textContent()).toContain(
+      '...Task 1 SearchHere Task 3...',
+    );
+  });
+
+  test('search result should remove docsify self helper markdown and keep related values', async ({
+    page,
+  }) => {
+    const docsifyInitConfig = {
+      markdown: {
+        homepage: `
+# Hello World
+
+!> SearchHere to check it!
+
+          `,
+      },
+      scriptURLs: ['/dist/plugins/search.js'],
+    };
+
+    const searchFieldElm = page.locator('input[type=search]');
+    const resultsHeadingElm = page.locator('.results-panel .content');
+
+    await docsifyInit(docsifyInitConfig);
+    await searchFieldElm.fill('SearchHere');
+    // remove the helper markdown and keep the related values
+    expect(await resultsHeadingElm.textContent()).toContain(
+      '...SearchHere to check it!...',
+    );
   });
 });

From 7fcc6bcbed94129bcbf486db2c81b7660e97d111 Mon Sep 17 00:00:00 2001
From: koy <koy@ko8e24.top>
Date: Thu, 1 Aug 2024 23:56:14 +0800
Subject: [PATCH 3/4] update: only append ellipses on content not empty

---
 src/plugins/search/component.js | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/plugins/search/component.js b/src/plugins/search/component.js
index 8496454a5..2bce54b97 100644
--- a/src/plugins/search/component.js
+++ b/src/plugins/search/component.js
@@ -49,11 +49,12 @@ function doSearch(value) {
 
   let html = '';
   matches.forEach((post, i) => {
+    const content = post.content ? `...${post.content}...` : '';
     html += /* html */ `
       <div class="matching-post" aria-label="search result ${i + 1}">
         <a href="${post.url}">
           <p class="title clamp-1">${post.title}</p>
-          <p class="content clamp-2">...${post.content}...</p>
+          <p class="content clamp-2">${content}</p>
         </a>
       </div>
     `;

From 09cf22133f61d1fb6eb416bf7081e5493e19242c Mon Sep 17 00:00:00 2001
From: koy <koy@ko8e24.top>
Date: Thu, 19 Sep 2024 13:32:41 +0800
Subject: [PATCH 4/4] update: fix mismatch search content

---
 src/plugins/search/search.js | 4 ++--
 test/e2e/search.test.js      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/plugins/search/search.js b/src/plugins/search/search.js
index 487718bff..d4e33e3c3 100644
--- a/src/plugins/search/search.js
+++ b/src/plugins/search/search.js
@@ -230,8 +230,8 @@ export function search(query) {
           start = indexContent < 11 ? 0 : indexContent - 10;
           end = start === 0 ? 100 : indexContent + keyword.length + 90;
 
-          if (postContent && end > postContent.length) {
-            end = postContent.length;
+          if (handlePostContent && end > handlePostContent.length) {
+            end = handlePostContent.length;
           }
 
           const matchContent =
diff --git a/test/e2e/search.test.js b/test/e2e/search.test.js
index 8fc04adf6..0d0056ae2 100644
--- a/test/e2e/search.test.js
+++ b/test/e2e/search.test.js
@@ -254,7 +254,7 @@ console.log('Hello World');
     await searchFieldElm.fill('searchHere');
     // there is a newline after searchHere and the markdown part ```js ``` it should be removed
     expect(await resultsHeadingElm.textContent()).toContain(
-      "...searchHere\nconsole.log('Hello Worl...",
+      "...searchHere\nconsole.log('Hello World');...",
     );
   });