Skip to content

Commit

Permalink
Fix broken links in lists, indented text and multiline
Browse files Browse the repository at this point in the history
When replacing links, we parse the markdown document line by line. When
the markdown parser encounters an indented line without the context of
the surrounding lines, it assumes a code block and does not render the
links to HTML, which means we do not rewrite those links.

If the link is broken over two lines we also fail to discover it.
This patch fixes the issue by changing the processing from line by line
to the document as a whole.

Fixes: #229

Signed-off-by: Andrea Frittoli <andrea.frittoli@gmail.com>
  • Loading branch information
afrittoli committed Feb 19, 2021
1 parent 1a2e500 commit 491f5c7
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 35 deletions.
4 changes: 2 additions & 2 deletions sync/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ Jinja2==2.11.1
google-auth==1.14.0
urlopen==1.0.0
markdown==3.1.1
lxml==4.5.2
coverage==5.3
flake8==3.8.3
click>=7.1.2
gitpython>=3.1.11
gitdb-speedups>=0.1.0
gitdb-speedups>=0.1.0
beautifulsoup4==4.9.3
71 changes: 45 additions & 26 deletions sync/sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,16 @@
from multiprocessing import Pool
import os
import os.path
import re
import sys
from urllib.error import URLError
from urllib.parse import urlparse, urljoin, urlunparse

from bs4 import BeautifulSoup
import click
import git
from jinja2 import Environment
from jinja2 import FileSystemLoader
from lxml import etree
from ruamel.yaml import YAML


Expand All @@ -49,7 +50,9 @@

jinja_env = Environment(loader=FileSystemLoader(TEMPLATE_DIR))

FM_BOUNDARY = re.compile(r"^(?:<!--\n)?-{3,}\s*$(?:\n-->)?", re.MULTILINE)
YAML_SEPARATOR = "---\n"

FOLDER_INDEX = '_index.md'


Expand Down Expand Up @@ -176,13 +179,13 @@ def transform_doc(doc, source_folder, target, target_folder, header,
target = os.path.join(site_target_folder, target)
with open(target, 'w+') as target_doc:
# If there is an header configured, write it (in YAML)
if header:
target_doc.write(YAML_SEPARATOR)
YAML().dump(header, target_doc)
target_doc.write(YAML_SEPARATOR)
for line in decode(doc.data_stream.read()).splitlines():
target_doc.write(
f'{transform_line(line, source_folder, local_files, base_path, base_url)}\n')
doc_all = decode(doc.data_stream.read())
doc_markdown, fm = read_front_matter(doc_all)
# Update the doc front matter with the configured one and write it
write_front_matter(target_doc, fm, header)
doc_markdown = transform_links_doc(
doc_markdown, source_folder, local_files, base_path, base_url)
target_doc.write(doc_markdown)
return target


Expand All @@ -195,29 +198,45 @@ def decode(s, encodings=('utf8', 'latin1', 'ascii')):
return s.decode('ascii', 'ignore')


def transform_line(line, base_path, local_files, rewrite_path, rewrite_url):
    """Rewrite the link targets found in a single line of markdown.

    Trailing whitespace is stripped from the line, the line is handed to
    ``get_links`` and every href found is replaced by the value returned
    by ``transform_link``.

    Args:
        line: one line of markdown text.
        base_path, local_files, rewrite_path, rewrite_url: forwarded
            unchanged to ``transform_link``.

    Returns:
        The right-stripped line with its hrefs rewritten.
    """
    line = line.rstrip()
    # If there are links in this line we may need to fix them
    for link in get_links(line):
        href = link.get("href")
        if not href:
            # Anchors without a target (e.g. <a name="...">) have nothing
            # to rewrite; passing None on to str.replace would raise.
            continue
        href_mod = transform_link(
            href, base_path, local_files, rewrite_path, rewrite_url)
        line = line.replace(href, href_mod)
    return line
def read_front_matter(text):
    """Split a document into its body and its parsed front matter.

    Returns a ``(content, front_matter)`` tuple. ``front_matter`` is the
    result of loading the YAML found between the first two
    ``FM_BOUNDARY`` matches. When the text does not start with a
    boundary, or the boundary only occurs once, the text is returned
    untouched together with ``None``.
    """
    if not FM_BOUNDARY.match(text):
        return text, None

    pieces = FM_BOUNDARY.split(text, 2)
    if len(pieces) != 3:
        # The opening boundary was never closed: no front matter.
        return text, None

    raw_front_matter, body = pieces[1], pieces[2]
    # Drop the single newline that follows the closing boundary.
    if body.startswith('\n'):
        body = body[1:]
    return body, YAML().load(raw_front_matter)

def write_front_matter(target_doc, fm_doc, fm_config):
    """Write the merged front matter to ``target_doc`` as a YAML block.

    ``fm_config`` is merged on top of ``fm_doc`` (configured values win
    on key clashes; a non-empty ``fm_doc`` is updated in place). Nothing
    is written when the merged mapping is empty; otherwise the YAML dump
    is wrapped between two ``YAML_SEPARATOR`` lines.
    """
    merged = fm_doc if fm_doc else {}
    merged.update(fm_config if fm_config else {})
    if not merged:
        return
    target_doc.write(YAML_SEPARATOR)
    YAML().dump(merged, target_doc)
    target_doc.write(YAML_SEPARATOR)

def transform_links_doc(text, base_path, local_files, rewrite_path, rewrite_url):
    """Rewrite the link targets found in a whole markdown document.

    The complete text is handed to ``get_links`` in one go, and every
    href found is mapped to the value returned by ``transform_link``.

    All replacements are applied in a single pass over the text. Chained
    ``str.replace`` calls would let an href that is a substring of
    another href (or of an already rewritten target) be rewritten a
    second time, corrupting the link.

    Args:
        text: the markdown document (without front matter).
        base_path, local_files, rewrite_path, rewrite_url: forwarded
            unchanged to ``transform_link``.

    Returns:
        The text with every discovered href replaced by its rewrite.
    """
    # Rewrite map, only use links with an href
    rewrite_map = {}
    for anchor in get_links(text):
        href = anchor.get("href")
        if href:
            rewrite_map[href] = transform_link(
                href, base_path, local_files, rewrite_path, rewrite_url)

    if not rewrite_map:
        # An empty alternation would match the empty string everywhere.
        return text

    # Longest href first so that a longer link always wins over a shorter
    # one that happens to be a substring of it.
    ordered_hrefs = sorted(rewrite_map, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(href) for href in ordered_hrefs))
    return pattern.sub(lambda found: rewrite_map[found.group(0)], text)


def get_links(md):
""" return a list of all the links in a string formatted in markdown """
md = markdown.markdown(md)
try:
doc = etree.fromstring(md)
return doc.xpath('//a')
except etree.XMLSyntaxError:
pass

return []
soup = BeautifulSoup(md, 'html.parser')
return soup.find_all("a")


def transform_link(link, base_path, local_files, rewrite_path, rewrite_url):
Expand Down
35 changes: 28 additions & 7 deletions sync/test_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
from sync import (
doc_config, docs_from_tree, get_links, is_absolute_url,
is_fragment, get_tags, load_config, save_config,
get_files_in_path, transform_link, transform_line,
transform_doc, transform_docs)
get_files_in_path, transform_link, transform_links_doc,
transform_doc, transform_docs, read_front_matter)


BASE_FOLDER = os.path.dirname(os.path.abspath(__file__))
Expand Down Expand Up @@ -257,7 +257,7 @@ def test_transform_link(self):
transform_link(case, base_path, local_files, rewrite_path, rewrite_url),
expected)

def test_transform_line(self):
def test_transform_links_doc(self):
self.maxDiff = None

# Links are in a page stored under base_path
Expand All @@ -282,7 +282,9 @@ def test_transform_line(self):
"[notfound-relative-link-dotdot](../examples/notfound.txt)",
"[invalid-absolute-link](www.github.com)",
("[valid-absolute-link](https://website-random321.net#FRagment) "
"[valid-ref-link](#fooTEr)")
"[valid-ref-link](#fooTEr)"),
("Valid link broken on two lines [exists-link-in-list]("
"./test.txt)")
]
expected_results = [
"[exists-relative-link](/docs/test/test.txt)",
Expand All @@ -295,15 +297,34 @@ def test_transform_line(self):
"[notfound-relative-link-dotdot](http://test.com/tree/docs/examples/notfound.txt)",
"[invalid-absolute-link](http://test.com/tree/docs/www.github.com)",
("[valid-absolute-link](https://website-random321.net#FRagment) "
"[valid-ref-link](#footer)")
"[valid-ref-link](#footer)"),
("Valid link broken on two lines [exists-link-in-list]("
"/docs/test/test.txt)")
]

for case, expected in zip(cases, expected_results):
actual = transform_line(
line=case, base_path=base_path, local_files=local_files,
actual = transform_links_doc(
text=case, base_path=base_path, local_files=local_files,
rewrite_path='/docs/test', rewrite_url='http://test.com/tree/docs/test'
)

def test_read_front_matter(self):
cases = [
'abc',
'---\ntest1',
'---\ntest1: abc\ntest2: 1\n---\nabc',
'<!--\n---\ntest1: abc\ntest2: 1\n---\n-->\nabc'
]
expected = [
('abc', None),
('---\ntest1', None),
('abc', {"test1": "abc", "test2": 1}),
('abc', {"test1": "abc", "test2": 1})
]
for case, exp in zip(cases, expected):
actual = read_front_matter(case)
self.assertEqual(actual, exp)

def test_transform_doc(self):
header = dict(test1='abc', test2=1, test3=True)
with tempfile.TemporaryDirectory() as site_dir:
Expand Down

0 comments on commit 491f5c7

Please sign in to comment.