Fix broken links in lists, indented text and links split across multiple lines

When replacing links, we parse the markdown document line by line. When the markdown parser encounters an indented line without the context of the surrounding lines, it assumes a code block and does not render the links to HTML, which means we do not rewrite those links. If a link is broken over two lines, we also fail to discover it. This patch fixes the issue by changing the processing from line by line to the document as a whole.
Fixes: #229
Signed-off-by: Andrea Frittoli <andrea.frittoli@gmail.com>
tektoncd · Feb 19, 2021 · 491f5c7 · 491f5c7
1 parent 1a2e500
commit 491f5c7
Show file tree

Hide file tree

Showing 3 changed files with 75 additions and 35 deletions.
diff --git a/sync/requirements.txt b/sync/requirements.txt
@@ -5,9 +5,9 @@ Jinja2==2.11.1
 google-auth==1.14.0
 urlopen==1.0.0
 markdown==3.1.1
-lxml==4.5.2
 coverage==5.3
 flake8==3.8.3
 click>=7.1.2
 gitpython>=3.1.11
-gitdb-speedups>=0.1.0
+gitdb-speedups>=0.1.0
+beautifulsoup4==4.9.3
diff --git a/sync/sync.py b/sync/sync.py
@@ -26,15 +26,16 @@
 from multiprocessing import Pool
 import os
 import os.path
+import re
 import sys
 from urllib.error import URLError
 from urllib.parse import urlparse, urljoin, urlunparse
 
+from bs4 import BeautifulSoup
 import click
 import git
 from jinja2 import Environment
 from jinja2 import FileSystemLoader
-from lxml import etree
 from ruamel.yaml import YAML
 
 
@@ -49,7 +50,9 @@
 
 jinja_env = Environment(loader=FileSystemLoader(TEMPLATE_DIR))
 
+FM_BOUNDARY = re.compile(r"^(?:<!--\n)?-{3,}\s*$(?:\n-->)?", re.MULTILINE)
 YAML_SEPARATOR = "---\n"
+
 FOLDER_INDEX = '_index.md'
 
 
@@ -176,13 +179,13 @@ def transform_doc(doc, source_folder, target, target_folder, header,
     target = os.path.join(site_target_folder, target)
     with open(target, 'w+') as target_doc:
         # If there is an header configured, write it (in YAML)
-        if header:
-            target_doc.write(YAML_SEPARATOR)
-            YAML().dump(header, target_doc)
-            target_doc.write(YAML_SEPARATOR)
-        for line in decode(doc.data_stream.read()).splitlines():
-            target_doc.write(
-                f'{transform_line(line, source_folder, local_files, base_path, base_url)}\n')
+        doc_all = decode(doc.data_stream.read())
+        doc_markdown, fm = read_front_matter(doc_all)
+        # Update the doc front matter with the configured one and write it
+        write_front_matter(target_doc, fm, header)
+        doc_markdown = transform_links_doc(
+            doc_markdown, source_folder, local_files, base_path, base_url)
+        target_doc.write(doc_markdown)
     return target
 
 
@@ -195,29 +198,45 @@ def decode(s, encodings=('utf8', 'latin1', 'ascii')):
     return s.decode('ascii', 'ignore')
 
 
-def transform_line(line, base_path, local_files, rewrite_path, rewrite_url):
-    """ transform all the links in one line """
-    line = line.rstrip()
-    links = get_links(line)
-    # If there are links in this line we may need to fix them
-    for link in links:
-        # link contains the text and href
-        href =link.get("href")
-        href_mod = transform_link(href, base_path, local_files, rewrite_path, rewrite_url)
-        line = line.replace(href, href_mod)
-    return line
+def read_front_matter(text):
+    """ returns a tuple text, frontmatter (as dict) """
+    if FM_BOUNDARY.match(text):
+        try:
+            _, fm, content = FM_BOUNDARY.split(text, 2)
+        except ValueError:
+            # Not enough values to unpack, boundary was matched once
+            return text, None
+        if content.startswith('\n'):
+            content = content[1:]
+        return content, YAML().load(fm)
+    else:
+        return text, None
+
+def write_front_matter(target_doc, fm_doc, fm_config):
+    fm_doc = fm_doc or {}
+    fm_config = fm_config or {}
+    fm_doc.update(fm_config)
+    if fm_doc:
+        target_doc.write(YAML_SEPARATOR)
+        YAML().dump(fm_doc, target_doc)
+        target_doc.write(YAML_SEPARATOR)
+
+def transform_links_doc(text, base_path, local_files, rewrite_path, rewrite_url):
+    """ transform all the links the text """
+    links = get_links(text)
+    # Rewrite map, only use links with an href
+    rewrite_map = {x.get("href"): transform_link(x.get("href"), base_path, local_files, rewrite_path, rewrite_url)
+        for x in links if x.get("href")}
+    for source, target in rewrite_map.items():
+        text = text.replace(source, target)
+    return text
 
 
 def get_links(md):
     """ return a list of all the links in a string formatted in markdown """
     md = markdown.markdown(md)
-    try:
-        doc = etree.fromstring(md)
-        return doc.xpath('//a')
-    except etree.XMLSyntaxError:
-        pass
-
-    return []
+    soup = BeautifulSoup(md, 'html.parser')
+    return soup.find_all("a")
 
 
 def transform_link(link, base_path, local_files, rewrite_path, rewrite_url):

diff --git a/sync/test_sync.py b/sync/test_sync.py
@@ -28,8 +28,8 @@
 from sync import (
     doc_config, docs_from_tree, get_links, is_absolute_url,
     is_fragment, get_tags, load_config, save_config,
-    get_files_in_path, transform_link, transform_line,
-    transform_doc, transform_docs)
+    get_files_in_path, transform_link, transform_links_doc,
+    transform_doc, transform_docs, read_front_matter)
 
 
 BASE_FOLDER = os.path.dirname(os.path.abspath(__file__))
@@ -257,7 +257,7 @@ def test_transform_link(self):
                 transform_link(case, base_path, local_files, rewrite_path, rewrite_url),
                 expected)
 
-    def test_transform_line(self):
+    def test_transform_links_doc(self):
         self.maxDiff = None
 
         # Links are in a page stored undrer base_path
@@ -282,7 +282,9 @@ def test_transform_line(self):
             "[notfound-relative-link-dotdot](../examples/notfound.txt)",
             "[invalid-absolute-link](www.github.com)",
             ("[valid-absolute-link](https://website-random321.net#FRagment) "
-             "[valid-ref-link](#fooTEr)")
+             "[valid-ref-link](#fooTEr)"),
+            ("Valid link broken on two lines [exists-link-in-list]("
+            "./test.txt)")
         ]
         expected_results = [
             "[exists-relative-link](/docs/test/test.txt)",
@@ -295,15 +297,34 @@ def test_transform_line(self):
             "[notfound-relative-link-dotdot](http://test.com/tree/docs/examples/notfound.txt)",
             "[invalid-absolute-link](http://test.com/tree/docs/www.github.com)",
             ("[valid-absolute-link](https://website-random321.net#FRagment) "
-             "[valid-ref-link](#footer)")
+             "[valid-ref-link](#footer)"),
+            ("Valid link broken on two lines [exists-link-in-list]("
+            "/docs/test/test.txt)")
         ]
 
         for case, expected in zip(cases, expected_results):
-            actual = transform_line(
-                line=case, base_path=base_path, local_files=local_files,
+            actual = transform_links_doc(
+                text=case, base_path=base_path, local_files=local_files,
                 rewrite_path='/docs/test', rewrite_url='http://test.com/tree/docs/test'
             )
 
+    def test_read_front_matter(self):
+        cases = [
+            'abc',
+            '---\ntest1',
+            '---\ntest1: abc\ntest2: 1\n---\nabc',
+            '<!--\n---\ntest1: abc\ntest2: 1\n---\n-->\nabc'
+        ]
+        expected = [
+            ('abc', None),
+            ('---\ntest1', None),
+            ('abc', {"test1": "abc", "test2": 1}),
+            ('abc', {"test1": "abc", "test2": 1})
+        ]
+        for case, exp in zip(cases, expected):
+            actual = read_front_matter(case)
+            self.assertEqual(actual, exp)
+
     def test_transform_doc(self):
         header = dict(test1='abc', test2=1, test3=True)
         with tempfile.TemporaryDirectory() as site_dir: