Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix parsing of punctuation in format_links() #3027

Merged
merged 7 commits into from
Nov 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 25 additions & 14 deletions bookwyrm/tests/views/test_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,21 +420,25 @@ def test_format_links_paragraph_break(self, *_):
'okay\n\n<a href="http://www.fish.com/">www.fish.com/</a>',
)

def test_format_links_parens(self, *_):
"""find and format urls into a tags"""
url = "http://www.fish.com/"
self.assertEqual(
views.status.format_links(f"({url})"),
f'(<a href="{url}">www.fish.com/</a>)',
)

def test_format_links_punctuation(self, *_):
"""don’t take trailing punctuation into account pls"""
url = "http://www.fish.com/"
self.assertEqual(
views.status.format_links(f"{url}."),
f'<a href="{url}">www.fish.com/</a>.',
)
"""test many combinations of brackets, URLs, and punctuation"""
url = "https://bookwyrm.social"
html = f'<a href="{url}">bookwyrm.social</a>'
test_table = [
("punct", f"text and {url}.", f"text and {html}."),
("multi_punct", f"text, then {url}?...", f"text, then {html}?..."),
("bracket_punct", f"here ({url}).", f"here ({html})."),
("punct_bracket", f"there [{url}?]", f"there [{html}?]"),
("punct_bracket_punct", f"not here? ({url}!).", f"not here? ({html}!)."),
(
"multi_punct_bracket",
f"not there ({url}...);",
f"not there ({html}...);",
),
]
for desc, text, output in test_table:
with self.subTest(desc=desc):
self.assertEqual(views.status.format_links(text), output)

def test_format_links_special_chars(self, *_):
"""find and format urls into a tags"""
Expand Down Expand Up @@ -464,6 +468,13 @@ def test_format_links_special_chars(self, *_):
views.status.format_links(url), f'<a href="{url}">{url[8:]}</a>'
)

def test_format_links_ignore_non_urls(self, *_):
"""formating links should leave plain text untouced"""
text_elision = "> “The distinction is significant.” [...]" # bookwyrm#2993
text_quoteparens = "some kind of gene-editing technology (?)" # bookwyrm#3049
self.assertEqual(views.status.format_links(text_elision), text_elision)
self.assertEqual(views.status.format_links(text_quoteparens), text_quoteparens)

def test_format_mentions_with_at_symbol_links(self, *_):
"""A link with an @username shouldn't treat the username as a mention"""
content = "a link to https://example.com/user/@mouse"
Expand Down
69 changes: 27 additions & 42 deletions bookwyrm/views/status.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
""" what are we here for if not for posting """
import re
import logging
from urllib.parse import urlparse

from django.contrib.auth.decorators import login_required
from django.core.validators import URLValidator
Expand Down Expand Up @@ -297,65 +296,51 @@ def find_or_create_hashtags(content):

def format_links(content):
"""detect and format links"""
validator = URLValidator()
formatted_content = ""
validator = URLValidator(["http", "https"])
schema_re = re.compile(r"\bhttps?://")
split_content = re.split(r"(\s+)", content)

for potential_link in split_content:
if not potential_link:
for i, potential_link in enumerate(split_content):
if not schema_re.search(potential_link):
continue
wrapped = _wrapped(potential_link)
if wrapped:
wrapper_close = potential_link[-1]
formatted_content += potential_link[0]
potential_link = potential_link[1:-1]

ends_with_punctuation = _ends_with_punctuation(potential_link)
if ends_with_punctuation:
punctuation_glyph = potential_link[-1]
potential_link = potential_link[0:-1]

# Strip surrounding brackets and trailing punctuation.
prefix, potential_link, suffix = _unwrap(potential_link)
try:
# raises an error on anything that's not a valid link
validator(potential_link)

# use everything but the scheme in the presentation of the link
url = urlparse(potential_link)
link = url.netloc + url.path + url.params
if url.query != "":
link += "?" + url.query
if url.fragment != "":
link += "#" + url.fragment

formatted_content += f'<a href="{potential_link}">{link}</a>'
link = schema_re.sub("", potential_link)
split_content[i] = f'{prefix}<a href="{potential_link}">{link}</a>{suffix}'
except (ValidationError, UnicodeError):
formatted_content += potential_link
pass

if wrapped:
formatted_content += wrapper_close
return "".join(split_content)

if ends_with_punctuation:
formatted_content += punctuation_glyph

return formatted_content
def _unwrap(text):
"""split surrounding brackets and trailing punctuation from a string of text"""
punct = re.compile(r'([.,;:!?"’”»]+)$')
prefix = suffix = ""

if punct.search(text):
# Move punctuation to suffix segment.
text, suffix, _ = punct.split(text)

def _wrapped(text):
"""check if a line of text is wrapped"""
wrappers = [("(", ")"), ("[", "]"), ("{", "}")]
for wrapper in wrappers:
for wrapper in ("()", "[]", "{}"):
if text[0] == wrapper[0] and text[-1] == wrapper[-1]:
return True
return False
# Split out wrapping chars.
suffix = text[-1] + suffix
prefix, text = text[:1], text[1:-1]
break # Nested wrappers not supported atm.

if punct.search(text):
# Move inner punctuation to suffix segment.
text, inner_punct, _ = punct.split(text)
suffix = inner_punct + suffix

def _ends_with_punctuation(text):
"""check if a line of text ends with a punctuation glyph"""
glyphs = [".", ",", ";", ":", "!", "?", "”", "’", '"', "»"]
for glyph in glyphs:
if text[-1] == glyph:
return True
return False
return prefix, text, suffix


def to_markdown(content):
Expand Down