-
Notifications
You must be signed in to change notification settings - Fork 264
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix markdown block quote truncation (#1457)
Fix markdown block quote truncation
- Loading branch information
Robert Szefler
authored
Jul 9, 2024
1 parent
fcc8658
commit d6d7230
Showing
6 changed files
with
394 additions
and
231 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
try: | ||
from itertools import batched | ||
except ImportError: # Python < 3.12 | ||
from more_itertools import batched | ||
|
||
import regex | ||
|
||
|
||
def trim_markdown(text: str, max_length: int, suffix: str = "...") -> str:
    """Truncate markdown *text* to at most *max_length* characters.

    This is not a general markdown truncator: the only construct it keeps
    well-formed is the fenced code block delimited by triple backticks.
    Handling every possible markdown tag correctly would be considerably
    more complex.

    Args:
        text: The markdown text to shorten.
        max_length: Upper bound for the length of the returned string.
        suffix: Marker appended after the cut. It counts towards
            *max_length*.

    Returns:
        The shortened text. Cases, in order:

        * If *max_length* leaves no room for any text, only the leading
          part of *suffix* that fits is returned (empty for a
          non-positive *max_length*).
        * If the text has no fences and fits in front of the suffix, it is
          returned unchanged.
        * If the cut lands inside a code block (or its closing fence), the
          block is re-closed with a fence before the suffix; when there is
          not enough room for that, the whole block is dropped.
        * If the cut lands inside an opening fence, the text is cut just
          before that fence.
        * If the text contains fences and the cut lands after the last
          block, the suffix is still appended, even when nothing was
          actually removed.

        An unmatched trailing fence is treated as a code block that runs
        to the end of the text.
    """
    fence = "```"
    fence_len = len(fence)

    trim_idx = max_length - len(suffix)
    if trim_idx <= 0:  # The pathological cases: no room for any text.
        # Clamp so a negative max_length cannot slice from the suffix's end.
        return suffix[:max(max_length, 0)]

    # Locate non-overlapping fences scanning right-to-left, then put the
    # offsets back into ascending order so they can be paired as open/close.
    fence_starts = []
    pos = text.rfind(fence)
    while pos != -1:
        fence_starts.append(pos)
        pos = text.rfind(fence, 0, pos)
    fence_starts.reverse()

    # Each block is (start of opening fence, end of closing fence). A final
    # unpaired fence opens a block that is never closed; model it as ending
    # at the end of the text with a zero-width closing fence.
    blocks = []
    for i in range(0, len(fence_starts), 2):
        open_start = fence_starts[i]
        if i + 1 < len(fence_starts):
            close_end = fence_starts[i + 1] + fence_len
        else:
            close_end = len(text)
        blocks.append((open_start, close_end))

    # Walk the blocks from the last one backwards until the one that
    # contains (or precedes) the trimming point is found.
    for open_start, close_end in reversed(blocks):
        open_end = open_start + fence_len

        if trim_idx >= close_end:
            # Trimming point is after this block: a plain cut is safe.
            return text[:trim_idx] + suffix

        if trim_idx < open_start:
            # Trimming point is before this block: look at the preceding one.
            continue

        if trim_idx < open_end:
            # Trimming point is inside the opening fence. Cut right before
            # the fence; stripping backticks instead could also eat a closing
            # fence that immediately precedes this block.
            return text[:open_start] + suffix

        # Trimming point is inside the block body or its closing fence.
        if trim_idx - open_end >= fence_len:
            # Enough room to give up three characters for a closing fence.
            return text[:trim_idx - fence_len] + fence + suffix
        # Not enough room to re-close the block: drop it entirely.
        return text[:open_start] + suffix

    # No code block affects the trimming point.
    if len(text) <= trim_idx:
        return text
    return text[:trim_idx] + suffix
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
import pytest | ||
|
||
from robusta.utils.trim_markdown import trim_markdown | ||
|
||
|
||
@pytest.mark.parametrize(
    "max_length,expected_output",
    [
        (0, ""),
        (1, "#"),
        (2, "##"),
        (3, "##"),
        (4, "##"),
        (5, "##"),
        (6, "##"),
        (7, "##"),
        (8, "``````##"),
        (9, "```o```##"),
        (10, "```oh```##"),
        (13, "```oh``` he##"),
        (16, "```oh``` hello##"),
        (17, "```oh``` hello ##"),
        (18, "```oh``` hello ##"),
        (19, "```oh``` hello ##"),
        (20, "```oh``` hello ##"),
        (21, "```oh``` hello ##"),
        (22, "```oh``` hello ##"),
        (23, "```oh``` hello ``````##"),
        (24, "```oh``` hello ```w```##"),
        (25, "```oh``` hello ```wo```##"),
        (27, "```oh``` hello ```worl```##"),
        (28, "```oh``` hello ```world```##"),
        (29, "```oh``` hello ```world``` ##"),
        (31, "```oh``` hello ```world``` an##"),
        (39, "```oh``` hello ```world``` and then ##"),
        (42, "```oh``` hello ```world``` and then ##"),
        (44, "```oh``` hello ```world``` and then ``````##"),
        (48, "```oh``` hello ```world``` and then ```some```##"),
        (52, "```oh``` hello ```world``` and then ```somethin```##"),
        (53, "```oh``` hello ```world``` and then ```something```##"),
        (54, "```oh``` hello ```world``` and then ```something```##"),
        (111, "```oh``` hello ```world``` and then ```something```##"),
    ],
)
def test_trim_markdown(max_length: int, expected_output: str):
    """Check trimming of text holding three fenced code blocks.

    Each case pins the exact output for one length limit and also verifies
    that the result never exceeds that limit.
    """
    source = "```oh``` hello ```world``` and then ```something```"
    result = trim_markdown(source, max_length, suffix="##")
    assert result == expected_output
    assert len(result) <= max_length
|
||
|
||
@pytest.mark.parametrize(
    "max_length,expected_output",
    [
        (0, ""),
        (1, "$"),
        (2, "$$"),
        (3, "$$$"),
        (4, "N$$$"),
        (5, "No$$$"),
        (10, "No code$$$"),
        (38, "No code blocks whatsoever in this t$$$"),
        (39, "No code blocks whatsoever in this te$$$"),
        (40, "No code blocks whatsoever in this tex$$$"),
        (41, "No code blocks whatsoever in this text"),
        (42, "No code blocks whatsoever in this text"),
        (111, "No code blocks whatsoever in this text"),
    ],
)
def test_trim_markdown_no_code_blocks(max_length: int, expected_output: str):
    """Check trimming of plain text that contains no code fences.

    Each case pins the exact output for one length limit and also verifies
    that the result never exceeds that limit.
    """
    source = "No code blocks whatsoever in this text"
    result = trim_markdown(source, max_length, suffix="$$$")
    assert result == expected_output
    assert len(result) <= max_length