Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix broken links in lists and indented text #248

Merged
merged 1 commit into from
Feb 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions sync/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ Jinja2==2.11.1
google-auth==1.14.0
urlopen==1.0.0
markdown==3.1.1
lxml==4.5.2
coverage==5.3
flake8==3.8.3
click>=7.1.2
gitpython>=3.1.11
gitdb-speedups>=0.1.0
gitdb-speedups>=0.1.0
beautifulsoup4==4.9.3
71 changes: 45 additions & 26 deletions sync/sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,16 @@
from multiprocessing import Pool
import os
import os.path
import re
import sys
from urllib.error import URLError
from urllib.parse import urlparse, urljoin, urlunparse

from bs4 import BeautifulSoup
import click
import git
from jinja2 import Environment
from jinja2 import FileSystemLoader
from lxml import etree
from ruamel.yaml import YAML


Expand All @@ -49,7 +50,9 @@

jinja_env = Environment(loader=FileSystemLoader(TEMPLATE_DIR))

FM_BOUNDARY = re.compile(r"^(?:<!--\n)?-{3,}\s*$(?:\n-->)?", re.MULTILINE)
YAML_SEPARATOR = "---\n"

FOLDER_INDEX = '_index.md'


Expand Down Expand Up @@ -176,13 +179,13 @@ def transform_doc(doc, source_folder, target, target_folder, header,
target = os.path.join(site_target_folder, target)
with open(target, 'w+') as target_doc:
# If there is a header configured, write it (in YAML)
if header:
target_doc.write(YAML_SEPARATOR)
YAML().dump(header, target_doc)
target_doc.write(YAML_SEPARATOR)
for line in decode(doc.data_stream.read()).splitlines():
target_doc.write(
f'{transform_line(line, source_folder, local_files, base_path, base_url)}\n')
doc_all = decode(doc.data_stream.read())
doc_markdown, fm = read_front_matter(doc_all)
# Update the doc front matter with the configured one and write it
write_front_matter(target_doc, fm, header)
doc_markdown = transform_links_doc(
doc_markdown, source_folder, local_files, base_path, base_url)
target_doc.write(doc_markdown)
return target


Expand All @@ -195,29 +198,45 @@ def decode(s, encodings=('utf8', 'latin1', 'ascii')):
return s.decode('ascii', 'ignore')


def transform_line(line, base_path, local_files, rewrite_path, rewrite_url):
    """Transform all the links in one line.

    Trailing whitespace is stripped from ``line``, the links it contains are
    extracted with ``get_links`` and every ``href`` found is rewritten with
    ``transform_link``; each original target is then replaced in the line by
    its rewritten value.

    :param line: a single line of markdown text
    :param base_path: forwarded unchanged to ``transform_link``
    :param local_files: forwarded unchanged to ``transform_link``
    :param rewrite_path: forwarded unchanged to ``transform_link``
    :param rewrite_url: forwarded unchanged to ``transform_link``
    :returns: the right-stripped line with its link targets rewritten
    """
    line = line.rstrip()
    # If there are links in this line we may need to fix them
    for link in get_links(line):
        # link contains the text and href
        href = link.get("href")
        if not href:
            # A missing href would make str.replace raise TypeError and an
            # empty one would insert the replacement between every character
            continue
        href_mod = transform_link(
            href, base_path, local_files, rewrite_path, rewrite_url)
        line = line.replace(href, href_mod)
    return line
def read_front_matter(text):
    """Split a document into its content and its parsed front matter.

    Returns a ``(content, front_matter)`` tuple. When ``text`` does not
    start with a front matter boundary, or when splitting on the boundary
    does not yield exactly three parts, ``text`` is returned untouched
    together with ``None``. Otherwise the part between the boundaries is
    loaded with ``YAML().load`` and one leading newline is removed from the
    remaining content.
    """
    if not FM_BOUNDARY.match(text):
        return text, None

    pieces = FM_BOUNDARY.split(text, 2)
    if len(pieces) != 3:
        # The opening boundary has no closing counterpart
        return text, None

    raw_front_matter, body = pieces[1], pieces[2]
    if body.startswith('\n'):
        body = body[1:]
    return body, YAML().load(raw_front_matter)

def write_front_matter(target_doc, fm_doc, fm_config):
    """Merge the configured front matter into the document one and write it.

    ``fm_config`` entries are applied on top of ``fm_doc`` (in place when
    ``fm_doc`` is a non-empty mapping). Nothing is written when the merged
    result is empty; otherwise it is dumped as YAML to ``target_doc``,
    surrounded by ``YAML_SEPARATOR`` lines.
    """
    front_matter = fm_doc if fm_doc else {}
    overrides = fm_config if fm_config else {}
    # Configured values take precedence over the ones found in the document
    front_matter.update(overrides)
    if not front_matter:
        return
    target_doc.write(YAML_SEPARATOR)
    YAML().dump(front_matter, target_doc)
    target_doc.write(YAML_SEPARATOR)

def transform_links_doc(text, base_path, local_files, rewrite_path, rewrite_url):
    """Transform all the links in the text.

    The links are extracted with ``get_links``; every non-empty ``href`` is
    rewritten once with ``transform_link`` and all its occurrences in
    ``text`` are replaced by the rewritten value.

    The replacement is done in a single pass so that rewrites cannot
    interfere with each other: an already rewritten target is never scanned
    again, and when one href is a prefix of another the longest one wins.

    :param text: the markdown document (front matter already removed)
    :param base_path: forwarded unchanged to ``transform_link``
    :param local_files: forwarded unchanged to ``transform_link``
    :param rewrite_path: forwarded unchanged to ``transform_link``
    :param rewrite_url: forwarded unchanged to ``transform_link``
    :returns: the text with its link targets rewritten
    """
    # Rewrite map, only use links with an href
    rewrite_map = {}
    for link in get_links(text):
        href = link.get("href")
        if href and href not in rewrite_map:
            rewrite_map[href] = transform_link(
                href, base_path, local_files, rewrite_path, rewrite_url)
    if not rewrite_map:
        # An empty alternation would match at every position
        return text

    # Longest sources first: regex alternation picks the first branch that
    # matches, so "./a/b" must be tried before "./a"
    sources = sorted(rewrite_map, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(source) for source in sources))
    return pattern.sub(lambda match: rewrite_map[match.group(0)], text)


def get_links(md):
    """Return a list of all the links in a string formatted in markdown.

    The markdown is rendered to HTML with ``markdown.markdown`` and the
    result is parsed with BeautifulSoup's ``html.parser``, which accepts
    HTML fragments made of several top-level elements (lists, multiple
    paragraphs, indented text). A strict XML parse of the same fragment
    fails on such input and would report no links at all.

    :param md: markdown text
    :returns: the ``<a>`` elements found; use ``.get("href")`` on each one
        to read its target
    """
    html = markdown.markdown(md)
    soup = BeautifulSoup(html, 'html.parser')
    return soup.find_all("a")


def transform_link(link, base_path, local_files, rewrite_path, rewrite_url):
Expand Down
35 changes: 28 additions & 7 deletions sync/test_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
from sync import (
doc_config, docs_from_tree, get_links, is_absolute_url,
is_fragment, get_tags, load_config, save_config,
get_files_in_path, transform_link, transform_line,
transform_doc, transform_docs)
get_files_in_path, transform_link, transform_links_doc,
transform_doc, transform_docs, read_front_matter)


BASE_FOLDER = os.path.dirname(os.path.abspath(__file__))
Expand Down Expand Up @@ -257,7 +257,7 @@ def test_transform_link(self):
transform_link(case, base_path, local_files, rewrite_path, rewrite_url),
expected)

def test_transform_line(self):
def test_transform_links_doc(self):
self.maxDiff = None

# Links are in a page stored under base_path
Expand All @@ -282,7 +282,9 @@ def test_transform_line(self):
"[notfound-relative-link-dotdot](../examples/notfound.txt)",
"[invalid-absolute-link](www.github.com)",
("[valid-absolute-link](https://website-random321.net#FRagment) "
"[valid-ref-link](#fooTEr)")
"[valid-ref-link](#fooTEr)"),
("Valid link broken on two lines [exists-link-in-list]("
"./test.txt)")
]
expected_results = [
"[exists-relative-link](/docs/test/test.txt)",
Expand All @@ -295,15 +297,34 @@ def test_transform_line(self):
"[notfound-relative-link-dotdot](http://test.com/tree/docs/examples/notfound.txt)",
"[invalid-absolute-link](http://test.com/tree/docs/www.github.com)",
("[valid-absolute-link](https://website-random321.net#FRagment) "
"[valid-ref-link](#footer)")
"[valid-ref-link](#footer)"),
("Valid link broken on two lines [exists-link-in-list]("
"/docs/test/test.txt)")
]

for case, expected in zip(cases, expected_results):
actual = transform_line(
line=case, base_path=base_path, local_files=local_files,
actual = transform_links_doc(
text=case, base_path=base_path, local_files=local_files,
rewrite_path='/docs/test', rewrite_url='http://test.com/tree/docs/test'
)

def test_read_front_matter(self):
    """read_front_matter returns (content, front matter) for each sample."""
    # Each scenario pairs an input document with the expected result
    scenarios = [
        # No front matter at all
        ('abc', ('abc', None)),
        # Opening boundary without a closing one
        ('---\ntest1', ('---\ntest1', None)),
        # Plain front matter
        ('---\ntest1: abc\ntest2: 1\n---\nabc',
         ('abc', {"test1": "abc", "test2": 1})),
        # Front matter wrapped in an HTML comment
        ('<!--\n---\ntest1: abc\ntest2: 1\n---\n-->\nabc',
         ('abc', {"test1": "abc", "test2": 1})),
    ]
    for document, wanted in scenarios:
        self.assertEqual(read_front_matter(document), wanted)

def test_transform_doc(self):
header = dict(test1='abc', test2=1, test3=True)
with tempfile.TemporaryDirectory() as site_dir:
Expand Down