Fix markdown block quote truncation (#1457)

Fix markdown block quote truncation
robusta-dev · Jul 9, 2024 · d6d7230 · d6d7230
1 parent fcc8658
commit d6d7230
Show file tree

Hide file tree

Showing 6 changed files with 394 additions and 231 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -71,6 +71,8 @@ boto3 = "1.28.72"
 prometheus-api-client = "0.5.4"
 requests = "^2.32.3"
 certifi = "^2023.7.22"
+regex = "2024.5.15"
+more_itertools = { version = "^10.3", python = "<3.12" }
 
 [tool.poetry.dev-dependencies]
 pre-commit = "^2.13.0"

diff --git a/src/robusta/core/sinks/transformer.py b/src/robusta/core/sinks/transformer.py
@@ -1,7 +1,6 @@
 import logging
 import re
 import urllib.parse
-from collections import defaultdict
 from typing import List, Optional, Union
 
 import markdown2
@@ -28,6 +27,7 @@ def tabulate(*args, **kwargs):
     ScanReportBlock,
     TableBlock,
 )
+from robusta.utils.trim_markdown import trim_markdown
 
 
 class Transformer:
@@ -58,6 +58,23 @@ def apply_length_limit(msg: str, max_length: int, truncator: Optional[str] = Non
         truncator = truncator or "..."
         return msg[: max_length - len(truncator)] + truncator
 
+    @staticmethod
+    def apply_length_limit_to_markdown(msg: str, max_length: int, truncator: str = "...") -> str:
+        """
+        Shorten markdown text to at most max_length characters.
+
+        Delegates to trim_markdown, which takes fenced code blocks into account when trimming.
+        If trim_markdown raises, falls back to the plain apply_length_limit so that a failure
+        in the markdown-aware path does not propagate to the caller.
+        """
+        try:
+            return trim_markdown(msg, max_length, truncator)
+        except Exception:
+            # Best-effort fallback. Catch Exception rather than using a bare except so that
+            # KeyboardInterrupt/SystemExit still propagate, and log the failure for diagnosis.
+            logging.warning("trim_markdown failed, falling back to plain truncation", exc_info=True)
+            return Transformer.apply_length_limit(msg, max_length, truncator)
+
     @staticmethod
     def to_markdown_diff(block: KubernetesDiffBlock, use_emoji_sign: bool = False) -> List[ListBlock]:
         # this can happen when a block.old=None or block.new=None - e.g. the resource was added or deleted

diff --git a/src/robusta/integrations/slack/sender.py b/src/robusta/integrations/slack/sender.py
@@ -146,7 +146,7 @@ def __to_slack_markdown(self, block: MarkdownBlock) -> List[SlackBlock]:
                 "type": "section",
                 "text": {
                     "type": "mrkdwn",
-                    "text": Transformer.apply_length_limit(block.text, MAX_BLOCK_CHARS),
+                    "text": Transformer.apply_length_limit_to_markdown(block.text, MAX_BLOCK_CHARS),
                 },
             }
         ]

diff --git a/src/robusta/utils/trim_markdown.py b/src/robusta/utils/trim_markdown.py
@@ -0,0 +1,63 @@
+try:
+    from itertools import batched
+except ImportError:  # Python < 3.12
+    from more_itertools import batched
+
+import regex
+
+CODE_FENCE = "```"
+
+
+def trim_markdown(text: str, max_length: int, suffix: str = "...") -> str:
+    """
+    Trim markdown text to at most max_length characters without cutting a fenced code block open.
+
+    This method of trimming markdown is not universal. It only takes care of correctly
+    trimming code blocks fenced with ```. Implementing a general truncation method for markdown
+    that would handle all the possible tags in a correct way would be rather complex.
+
+    :param text: the markdown text to trim
+    :param max_length: maximum length of the returned string
+    :param suffix: marker appended only when the text actually had to be trimmed
+    :return: text unchanged if it already fits in max_length; otherwise a trimmed copy ending
+        with suffix, in which a code block is either closed with a fence or dropped entirely
+    """
+    if len(text) <= max_length:
+        # Nothing to trim - in particular, never append the suffix to text that already fits.
+        return text
+
+    trim_idx = max_length - len(suffix)
+
+    if trim_idx <= 0:  # The pathological cases: room for (part of) the suffix at most.
+        return suffix[: max(max_length, 0)]
+
+    # Spans of all the code fences, from the last one in the text to the first one.
+    fences = [match.span() for match in regex.finditer(CODE_FENCE, text, regex.REVERSE)]
+
+    # Code blocks as (open_start, open_end, close_end) offsets, last block first.
+    blocks = []
+    if len(fences) % 2:
+        # Odd number of fences: the last one opens a block that is never closed.
+        # Treat it as a block extending to the end of the text.
+        open_start, open_end = fences.pop(0)
+        blocks.append((open_start, open_end, len(text)))
+    blocks.extend((*opening, closing[1]) for closing, opening in batched(fences, 2))
+
+    # Process code blocks backwards in the input
+    for open_start, open_end, close_end in blocks:
+        if trim_idx >= close_end:
+            # Trimming point after this code block - a plain cut cannot break any block
+            break
+        if trim_idx < open_start:
+            # Trimming point before this code block - continue to the preceding block
+            continue
+        if trim_idx < open_end:
+            # Trimming point inside the opening fence - drop the partial fence
+            return text[:open_start] + suffix
+        # Trimming point inside the block contents or inside the closing fence
+        if trim_idx - open_end >= len(CODE_FENCE):  # Enough space to insert the closing fence
+            return text[: trim_idx - len(CODE_FENCE)] + CODE_FENCE + suffix
+        # Not enough space, strip the whole block
+        return text[:open_start] + suffix
+
+    return text[:trim_idx] + suffix
diff --git a/tests/test_trim_markdown.py b/tests/test_trim_markdown.py
@@ -0,0 +1,85 @@
+import pytest
+
+from robusta.utils.trim_markdown import trim_markdown
+
+
+@pytest.mark.parametrize(
+    "max_length,expected_output", [
+        (0, ""),
+        (1, "#"),
+        (2, "##"),
+        (3, "##"),
+        (4, "##"),
+        (5, "##"),
+        (6, "##"),
+        (7, "##"),
+        (8, "``````##"),
+        (9, "```o```##"),
+        (10, "```oh```##"),
+        (13, "```oh``` he##"),
+        (16, "```oh``` hello##"),
+        (17, "```oh``` hello ##"),
+        (18, "```oh``` hello ##"),
+        (19, "```oh``` hello ##"),
+        (20, "```oh``` hello ##"),
+        (21, "```oh``` hello ##"),
+        (22, "```oh``` hello ##"),
+        (23, "```oh``` hello ``````##"),
+        (24, "```oh``` hello ```w```##"),
+        (25, "```oh``` hello ```wo```##"),
+        (27, "```oh``` hello ```worl```##"),
+        (28, "```oh``` hello ```world```##"),
+        (29, "```oh``` hello ```world``` ##"),
+        (31, "```oh``` hello ```world``` an##"),
+        (39, "```oh``` hello ```world``` and then ##"),
+        (42, "```oh``` hello ```world``` and then ##"),
+        (44, "```oh``` hello ```world``` and then ``````##"),
+        (48, "```oh``` hello ```world``` and then ```some```##"),
+        (50, "```oh``` hello ```world``` and then ```someth```##"),
+        (51, "```oh``` hello ```world``` and then ```something```"),
+        (52, "```oh``` hello ```world``` and then ```something```"),
+        (111, "```oh``` hello ```world``` and then ```something```"),
+    ])
+def test_trim_markdown(max_length: int, expected_output: str):
+    text = "```oh``` hello ```world``` and then ```something```"
+    trimmed = trim_markdown(text, max_length, "##")
+    assert trimmed == expected_output
+    assert len(trimmed) <= max_length
+
+
+@pytest.mark.parametrize(
+    "max_length,expected_output", [
+        (0, ""),
+        (1, "$"),
+        (2, "$$"),
+        (3, "$$$"),
+        (4, "N$$$"),
+        (5, "No$$$"),
+        (10, "No code$$$"),
+        (36, "No code blocks whatsoever in this$$$"),
+        (37, "No code blocks whatsoever in this $$$"),
+        (38, "No code blocks whatsoever in this text"),
+        (39, "No code blocks whatsoever in this text"),
+        (42, "No code blocks whatsoever in this text"),
+        (111, "No code blocks whatsoever in this text"),
+    ]
+)
+def test_trim_markdown_no_code_blocks(max_length: int, expected_output: str):
+    text = "No code blocks whatsoever in this text"
+    trimmed = trim_markdown(text, max_length, "$$$")
+    assert trimmed == expected_output
+    assert len(trimmed) <= max_length
+
+
+@pytest.mark.parametrize(
+    "max_length,expected_output", [
+        (10, "intro ..."),
+        (20, "intro ```unter```..."),
+        (26, "intro ```unterminated code"),
+    ]
+)
+def test_trim_markdown_unterminated_code_block(max_length: int, expected_output: str):
+    text = "intro ```unterminated code"
+    trimmed = trim_markdown(text, max_length, "...")
+    assert trimmed == expected_output
+    assert len(trimmed) <= max_length