From 7b454ae4b61ddddee1a51a8fe14f4583bdce000a Mon Sep 17 00:00:00 2001
From: robertu <4065233+robertu7@users.noreply.github.com>
Date: Mon, 11 Mar 2024 13:29:28 +0800
Subject: [PATCH 1/4] feat: add tests for normalizeCommentHTML

---
 package.json                       |  2 +-
 src/transformers/normalize.test.ts | 72 +++++++++++++++++++++++++++++-
 2 files changed, 71 insertions(+), 3 deletions(-)
diff --git a/package.json b/package.json
index 750bb11..5ff5148 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@matters/matters-editor",
-  "version": "0.2.3-alpha.0",
+  "version": "0.2.3-alpha.1",
   "description": "Editor for matters.news",
   "author": "https://github.com/thematters",
   "homepage": "https://github.com/thematters/matters-editor",
diff --git a/src/transformers/normalize.test.ts b/src/transformers/normalize.test.ts
index e47c019..f431407 100644
--- a/src/transformers/normalize.test.ts
+++ b/src/transformers/normalize.test.ts
@@ -1,16 +1,21 @@
 import { describe, expect, test } from 'vitest'
 
-import { normalizeArticleHTML } from './normalize'
+import { normalizeArticleHTML, normalizeCommentHTML } from './normalize'
 
 const expectNormalizeArticleHTML = (input: string, output: string) => {
   const result = normalizeArticleHTML(input)
   expect(result.trim()).toBe(output)
 }
 
+export const expectNormalizeCommentHTML = (input: string, output: string) => {
+  const result = normalizeCommentHTML(input)
+  expect(result.trim()).toBe(output)
+}
+
 /**
  * Tests
  */
-describe('Normalization', () => {
+describe('Normalization: Article', () => {
   test('bolds', () => {
     expectNormalizeArticleHTML(
       '<p><strong>abc</strong></p>',
@@ -279,3 +284,66 @@ describe('Normalization', () => {
     )
   })
 })
+
+describe('Normalization: Comment', () => {
+  test('bolds is not supported', () => {
+    expectNormalizeCommentHTML('<p><strong>abc</strong></p>', '<p>abc</p>')
+    expectNormalizeCommentHTML('<p><b>abc</b></p>', '<p>abc</p>')
+  })
+
+  test('strikethrough is not supported', () => {
+    expectNormalizeCommentHTML('<p><s>abc</s></p>', '<p>abc</p>')
+    expectNormalizeCommentHTML('<p><del>abc</del></p>', '<p>abc</p>')
+    expectNormalizeCommentHTML('<p><strike>abc</strike></p>', '<p>abc</p>')
+  })
+
+  test('italic is not supported', () => {
+    expectNormalizeCommentHTML('<p>abc</p>', '<p>abc</p>')
+
+    expectNormalizeCommentHTML('<p><i>abc</i></p>', '<p>abc</p>')
+  })
+
+  test('underline is not supported', () => {
+    expectNormalizeCommentHTML('<p><u>abc</u></p>', '<p>abc</p>')
+  })
+
+  test('self-closed tags', () => {
+    expectNormalizeCommentHTML('<p />', '<p></p>')
+
+    expectNormalizeCommentHTML('<br></br>', '<p><br class="smart"></p>')
+
+    expectNormalizeCommentHTML('<hr/>', '<p></p>')
+
+    // a self-closed <img /> inside a figure is dropped; only the caption text is kept
+    expectNormalizeCommentHTML(
+      '<figure class="image"><img src="https://assets.matters.news/embed/c40d5045-0c03-44b6-afe6-93a285ffd1bb.jpeg" /><figcaption>左：女反派。右：女主。</figcaption></figure>',
+      '<p>左：女反派。右：女主。</p>',
+    )
+
+    // a self-closed <iframe /> embed is dropped; the figure collapses to an empty paragraph
+    expectNormalizeCommentHTML(
+      '<figure class="embed" data-provider="youtube"><div class="iframe-container"><iframe src="https://www.youtube.com/embed/Zk7DppcfaMY?rel=0" loading="lazy" allowfullscreen frameborder="0" /></div><figcaption></figcaption></figure>',
+      '<p></p>',
+    )
+  })
+
+  test('figures are not supported', () => {
+    // image
+    expectNormalizeCommentHTML(
+      '<figure class="image"><img src="https://assets.matters.news/embed/c40d5045-0c03-44b6-afe6-93a285ffd1bb.jpeg"><figcaption>左：女反派。右：女主。</figcaption></figure>',
+      '<p>左：女反派。右：女主。</p>',
+    )
+
+    // audio
+    expectNormalizeCommentHTML(
+      '<figure class="audio"><audio controls><source src="https://assets.matters.news/embedaudio/0a45d56a-d19a-4300-bfa4-305639fd5a82/點數經濟-讓過路客成為回頭客.mp3" type="audio/mp3"></audio><div class="player"><header><div class="meta"><h4 class="title">點數經濟：讓過路客成為回頭客</h4><div class="time"><span class="current" data-time="00:00"></span><span class="duration" data-time="--:--"></span></div></div><span class="play"></span></header><footer><div class="progress-bar"><span></span></div></footer></div><figcaption>區塊勢 Podcast</figcaption></figure>',
+      '<p>點數經濟：讓過路客成為回頭客</p><p>區塊勢 Podcast</p>',
+    )
+
+    // video
+    expectNormalizeCommentHTML(
+      '<figure class="embed embed-video" data-provider="youtube"><div class="iframe-container"><iframe src="https://www.youtube.com/embed/Zk7DppcfaMY?rel=0" loading="lazy" allowfullscreen frameborder="0"></iframe></div><figcaption></figcaption></figure>',
+      '<p></p>',
+    )
+  })
+})

From a32f702311cd5f1d8467a63ee97ee12bc46e3680 Mon Sep 17 00:00:00 2001
From: robertu <4065233+robertu7@users.noreply.github.com>
Date: Mon, 11 Mar 2024 13:32:09 +0800
Subject: [PATCH 2/4] feat: add more tests to normalizeCommentHTML

---
 src/transformers/normalize.test.ts | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/transformers/normalize.test.ts b/src/transformers/normalize.test.ts
index f431407..95c9f6a 100644
--- a/src/transformers/normalize.test.ts
+++ b/src/transformers/normalize.test.ts
@@ -286,6 +286,20 @@ describe('Normalization: Article', () => {
 })
 
 describe('Normalization: Comment', () => {
+  test('quote', () => {
+    expectNormalizeCommentHTML(
+      '<blockquote><p>abc</p></blockquote>',
+      '<blockquote><p>abc</p></blockquote>',
+    )
+  })
+
+  test('link', () => {
+    expectNormalizeCommentHTML(
+      '<p><a target="_blank" rel="noopener noreferrer nofollow" href="https://example.com">abc</a></p>',
+      '<p><a target="_blank" rel="noopener noreferrer nofollow" href="https://example.com">abc</a></p>',
+    )
+  })
+
   test('bolds is not supported', () => {
     expectNormalizeCommentHTML('<p><strong>abc</strong></p>', '<p>abc</p>')
     expectNormalizeCommentHTML('<p><b>abc</b></p>', '<p>abc</p>')

From 64881c566fff22e6f898b984c947d67042285f47 Mon Sep 17 00:00:00 2001
From: robertu <4065233+robertu7@users.noreply.github.com>
Date: Mon, 11 Mar 2024 15:53:08 +0800
Subject: [PATCH 3/4] feat: add tests for current empty lines processor

---
 src/transformers/sanitize.test.ts | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/transformers/sanitize.test.ts b/src/transformers/sanitize.test.ts
index 8de3b07..12db7d6 100644
--- a/src/transformers/sanitize.test.ts
+++ b/src/transformers/sanitize.test.ts
@@ -1,3 +1,4 @@
+import { stripIndent } from 'common-tags'
 import { describe, expect, test } from 'vitest'
 
 import { sanitizeHTML } from './sanitize'
@@ -18,6 +19,25 @@ describe('Sanitization: custom', () => {
       '<a class="">pp</a>',
     )
   })
+
+  test('allow max two empty paragraphys', () => {
+    expectSanitizeHTML(
+      '<p>abc</p><p></p><p></p><p></p><p>abc</p><p></p><p></p><p></p><p></p><p></p><p></p>',
+      stripIndent`
+        <p>abc</p>
+        <p></p>
+        <p></p>
+        <p></p>
+        <p>abc</p>
+        <p></p>
+        <p></p>
+        <p></p>
+        <p></p>
+        <p></p>
+        <p></p>
+      `,
+    )
+  })
 })
 
 // via https://github.com/leizongmin/js-xss/blob/master/test/test_xss.js

From cd42355ce79a74f21f879d0c328fa7cba495df8c Mon Sep 17 00:00:00 2001
From: robertu <4065233+robertu7@users.noreply.github.com>
Date: Mon, 11 Mar 2024 19:15:50 +0800
Subject: [PATCH 4/4] feat: squeeze empty paragraphs to max 2

---
 src/transformers/sanitize.test.ts | 18 +++++++-
 src/transformers/sanitize.ts      | 72 ++++++++++++++++++++++++++++++-
 2 files changed, 87 insertions(+), 3 deletions(-)

diff --git a/src/transformers/sanitize.test.ts b/src/transformers/sanitize.test.ts
index 12db7d6..d46029f 100644
--- a/src/transformers/sanitize.test.ts
+++ b/src/transformers/sanitize.test.ts
@@ -22,17 +22,33 @@ describe('Sanitization: custom', () => {
 
   test('allow max two empty paragraphys', () => {
     expectSanitizeHTML(
-      '<p>abc</p><p></p><p></p><p></p><p>abc</p><p></p><p></p><p></p><p></p><p></p><p></p>',
       stripIndent`
         <p>abc</p>
         <p></p>
         <p></p>
+        abc
         <p></p>
         <p>abc</p>
         <p></p>
         <p></p>
         <p></p>
+        <p>abc</p>
+        <p></p>
+        <p><br></p>
+        <p><br/></p>
+        <p><br></br></p>
+        <p><br/><br/><br/></p>
+        <p></p>
+      `,
+      stripIndent`
+        <p>abc</p>
         <p></p>
+        <p></p>abc
+        <p></p>
+        <p>abc</p>
+        <p></p>
+        <p></p>
+        <p>abc</p>
         <p></p>
         <p></p>
       `,
diff --git a/src/transformers/sanitize.ts b/src/transformers/sanitize.ts
index 37ab3af..4dd59fc 100644
--- a/src/transformers/sanitize.ts
+++ b/src/transformers/sanitize.ts
@@ -1,3 +1,4 @@
+import { type Root, type RootContent } from 'hast'
 import rehypeFormat from 'rehype-format'
 import rehypeParse from 'rehype-parse'
 import rehypeRaw from 'rehype-raw'
@@ -11,14 +12,81 @@ import {
   rehypeStringifyOptions,
 } from './options'
 
+/**
+ * Squeeze each run of consecutive top-level empty paragraphs to a maximum of N
+ *
+ * e.g. with maxCount = 2:
+ * <p></p><p></p><p></p><p></p><p></p><p></p>
+ * =>
+ * <p></p><p></p>
+ *
+ * @param {number} maxCount - how many empty paragraphs to keep per run
+ */
+const rehypeSqueezeParagraphs =
+  ({ maxCount }: { maxCount: number }) =>
+  (tree: Root) => {
+    if (tree.type !== 'root') {
+      return
+    }
+
+    const children: RootContent[] = []
+    let count = 0
+    let touched = false
+
+    tree.children.forEach((node) => {
+      // keep whitespace-only text nodes; they don't break a run of empty paragraphs
+      if (node.type === 'text' && node.value.replace(/\s/g, '') === '') {
+        children.push(node)
+        return
+      }
+
+      // keep non-paragraph nodes as-is; they end the current run
+      if (node.type !== 'element' || node.tagName !== 'p') {
+        count = 0
+        children.push(node)
+        return
+      }
+
+      // keep non-empty paragraphs as-is; a paragraph counts as empty when it is:
+      // - <p></p>
+      // - <p><br/></p> (only <br> children)
+      const isEmptyParagraph =
+        node.children.length === 0 ||
+        node.children.every((n) => n.type === 'element' && n.tagName === 'br')
+      if (!isEmptyParagraph) {
+        count = 0
+        children.push(node)
+        return
+      }
+
+      // cap the run: collect a bare <p></p> for the first maxCount, drop the rest
+      count++
+      if (count <= maxCount) {
+        children.push({
+          type: 'element',
+          tagName: 'p',
+          properties: {},
+          children: [],
+        })
+      } else {
+        touched = true
+      }
+    })
+
+    if (touched) {
+      tree.children = children
+    }
+  }
+
 const formatter = unified()
   .use(rehypeParse, rehypeParseOptions)
   .use(rehypeRaw)
   .use(rehypeSanitize, rehypeSanitizeOptions)
+  .use(rehypeSqueezeParagraphs, { maxCount: 2 })
   .use(rehypeFormat)
   .use(rehypeStringify, rehypeStringifyOptions)
 
-export const sanitizeHTML = (md: string): string => {
-  const result = formatter.processSync(md)
+export const sanitizeHTML = (html: string): string => {
+  const result = formatter.processSync(html)
   return String(result)
 }