use pyparsing instead of bbcode

marph91 · Sep 26, 2024 · 806a99a · 806a99a
1 parent 0ec8295
commit 806a99a
Show file tree

Hide file tree

Showing 4 changed files with 186 additions and 97 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -18,7 +18,6 @@ extra_checks = true
 disable_error_code = "attr-defined"
 [[tool.mypy.overrides]]
 module = [
-    "bbcode",
     "enlighten",
     "frontmatter",
     "puremagic",

diff --git a/requirements/requirements.txt b/requirements/requirements.txt
@@ -1,4 +1,3 @@
-bbcode==1.1.0
 beautifulsoup4==4.12.3
 enlighten==1.12.4
 markdown==3.7

diff --git a/src/formats/tiddlywiki.py b/src/formats/tiddlywiki.py
@@ -267,7 +267,7 @@ def wikitext_to_md(wikitext: str) -> str:
         | list_()
     )
     # TODO: Why does "table" overwrite other rules when executes in the same run?
-    wikitext_complex =(
+    wikitext_complex = (
         # block quote:
         # https://tiddlywiki.com/static/Block%2520Quotes%2520in%2520WikiText.html
         multiline_quote()

diff --git a/src/formats/zettelkasten.py b/src/formats/zettelkasten.py
@@ -2,109 +2,177 @@
 
 import datetime as dt
 from pathlib import Path
+import re
 import xml.etree.ElementTree as ET  # noqa: N817
 
-import bbcode
+import pyparsing as pp
 
 import common
 import converter
 import intermediate_format as imf
 
 
-def bbcode_to_markdown(
-    bbcode_str: str,
-) -> tuple[str, list[imf.NoteLink], list[imf.Resource]]:
-    # pylint: disable=unused-argument
-    note_links = []
-    images = []
-
-    parser = bbcode.Parser()
-    parser.add_simple_formatter("h1", "# %(value)s")
-    parser.add_simple_formatter("h2", "## %(value)s")
-    parser.add_simple_formatter("br", "\n", standalone=True)
-    parser.add_simple_formatter("q", "> %(value)s")
-    parser.add_simple_formatter("code", "\n```\n%(value)s\n```")
-
-    # left, right aligned, centered, justified - not supported
-    parser.add_simple_formatter("al", "%(value)s")
-    parser.add_simple_formatter("ar", "%(value)s")
-    parser.add_simple_formatter("c", "%(value)s")
-    parser.add_simple_formatter("ab", "%(value)s")
-
-    # text formatting
-    parser.add_simple_formatter("f", "**%(value)s**")
-    parser.add_simple_formatter("k", "*%(value)s*")
-    parser.add_simple_formatter("u", "++%(value)s++")
-    parser.add_simple_formatter("d", "~~%(value)s~~")
-    parser.add_simple_formatter("qm", '"%(value)s"')
-    parser.add_simple_formatter("sub", "~%(value)s~")
-    parser.add_simple_formatter("sup", "^%(value)s^")
+# Prevent spaces, tabs and newlines from being stripped.
+pp.ParserElement.set_default_whitespace_chars("")
 
+
+colored_re = re.compile(r"\[h .*\](.*?)\[\/h\]")
+internal_link_re = re.compile(r"\[z (\d+)\](.*?)\[\/z\]")
+table_re = re.compile(r"\[table\](\[tc\](.*?)\[\/tc\])?([\S\s]*?)\[\/table\]")
+# Hacky, but works for now: the whole list is matched at once and split into items later.
+list_re = re.compile(r"\[([ln])\]\[\*\](.*?)\[\/\*\]\[\/[ln]\]")
+
+
+def tag(source_tag, target_tag, replace_first_only=False):
+    """Convert a quoted string, i.e. text enclosed by matching start and end tags."""
+
+    def to_md(_, t):  # noqa
+        if replace_first_only:
+            return target_tag + t[0]
+        return target_tag + t[0] + target_tag
+
+    return pp.QuotedString(
+        f"[{source_tag}]", end_quote_char=f"[/{source_tag}]"
+    ).set_parse_action(to_md)
+
+
+def newline():
+    return pp.Literal("[br]").set_parse_action(lambda: "\n")
+
+
+def colored():
     # colored -> bold
-    parser.add_simple_formatter("h", "**%(value)s**")
-
-    # forms
-    def _render_form(name, value, options, parent, context):
-        return " ".join([f"`{key}={value}`" for key, value in options.items()])
-
-    parser.add_formatter("form", _render_form, standalone=True)
-
-    # lists
-    def _render_list_item(name, value, options, parent, context):
-        match parent.tag_name:
-            case "l":
-                return f"* {value}\n"
-            case "n":
-                return f"1. {value}\n"
-            case _:
-                return value
-
-    parser.add_simple_formatter("l", "%(value)s")
-    parser.add_simple_formatter("n", "%(value)s")
-    parser.add_formatter("*", _render_list_item)
-
-    # images and internal note links
-    def _render_image(name, value, options, parent, context):
-        text = f"![]({value})"
-        images.append(imf.Resource(Path(value), text))
-        return text
-
-    parser.add_formatter("img", _render_image)
-
-    def _render_internal_link(name, value, options, parent, context):
-        id_ = list(options)[0]
-        text = f"[{value}]({id_})"
-        note_links.append(imf.NoteLink(text, id_, value))
-        return text
-
-    parser.add_formatter("z", _render_internal_link)
-
-    # tables
-    def _render_table(name, value, options, parent, context):
+    def to_md(_, t):  # noqa
+        return "**" + t[0][0] + "**"
+
+    return pp.Regex(colored_re, as_group_list=True).set_parse_action(to_md)
+
+
+def code_block():
+    def to_md(_, t):  # noqa
+        return f"\n```\n{t[0]}\n```"
+
+    return pp.QuotedString(
+        "[code]", end_quote_char="[/code]", multiline=True
+    ).set_parse_action(to_md)
+
+
+def list_():
+    def to_md(_, t):  # noqa
+        type_, content = t[0]
+        list_character = {"l": "*", "n": "1."}[type_]
+        return (
+            f"{list_character} "
+            + f"\n{list_character} ".join(content.split("[/*][*]"))
+            + "\n"
+        )
+
+    return pp.Regex(list_re, as_group_list=True).set_parse_action(to_md)
+
+
+def image():
+    def to_md(_, t):  # noqa
+        return f"![{t[0]}]({t[0]})"
+
+    return pp.QuotedString("[img]", end_quote_char="[/img]").set_parse_action(to_md)
+
+
+def internal_link():
+    def to_md(_, t):  # noqa
+        id_, title = t[0]
+        return f"[{title}](note://{id_})"
+
+    return pp.Regex(internal_link_re, as_group_list=True).set_parse_action(to_md)
+
+
+def table():
+    def to_md(_, t):  # noqa
+        _, caption, content = t[0]
+
         table_md = common.MarkdownTable()
-        for line in value.split("\n"):
+        if caption is not None:
+            table_md.caption = caption
+
+        for line in content.split("\n"):
             if not line.strip():
                 continue
             if "^" in line:
                 table_md.header_rows.append(line.split("^"))
-            elif "|" in line:
-                table_md.data_rows.append(line.split("|"))
             else:
-                table_md.caption += line
+                table_md.data_rows.append(line.split("|"))
         return table_md.create_md()
 
-    parser.add_formatter("table", _render_table)
-    parser.add_simple_formatter("tc", "%(value)s\n")
-
-    markdown = parser.format(
-        bbcode_str,
-        install_defaults=False,
-        escape_html=False,
-        newline="\n",
-        replace_cosmetic=False,
-        replace_links=False,
+    return pp.Regex(table_re, as_group_list=True).set_parse_action(to_md)
+
+
+def bbcode_to_md(wikitext: str) -> str:
+    r"""
+    Main bbcode to Markdown conversion function.
+
+    Hyperlinks need no conversion, because they are Markdown already.
+
+    >>> bbcode_to_md("[f]fett[/f]")
+    '**fett**'
+    >>> bbcode_to_md("das ist [d]durchgestrichener[/d] text")
+    'das ist ~~durchgestrichener~~ text'
+    >>> bbcode_to_md("[h #ffff00]colored[/h] text")
+    '**colored** text'
+    >>> bbcode_to_md("[h3]heading 3[/h3]")
+    '### heading 3'
+    >>> bbcode_to_md("some[br]li nes[br]he re")
+    'some\nli nes\nhe re'
+    >>> bbcode_to_md("[q]single line quote[/q]")
+    '> single line quote'
+    >>> bbcode_to_md("disappearing [al]tag[/al]")
+    'disappearing tag'
+    >>> bbcode_to_md("[code]some code[/code]")
+    '\n```\nsome code\n```'
+    >>> bbcode_to_md("[code]long[br]code block[/code]")
+    '\n```\nlong\ncode block\n```'
+    >>> bbcode_to_md("[img]some image.png[/img]")
+    '![some image.png](some image.png)'
+    >>> bbcode_to_md("link [z 3]zu Zettel 3[/z]")
+    'link [zu Zettel 3](note://3)'
+    >>> bbcode_to_md("[table][tc]Test Table[/tc][br]h 1^h 2^h3[br]d1 |d 2 |d3[/table]")
+    'Test Table\n\n| h 1 | h 2 | h3 |\n| --- | --- | --- |\n| d1  | d 2  | d3 |\n'
+    >>> bbcode_to_md("[table]h 1^h 2^h3[br]d1 |d 2 |d3[/table]")
+    '| h 1 | h 2 | h3 |\n| --- | --- | --- |\n| d1  | d 2  | d3 |\n'
+    >>> bbcode_to_md("[l][*]Here an item[/*][*]Other item![/*][/l]")
+    '* Here an item\n* Other item!\n'
+    >>> bbcode_to_md("[n][*]Numbered item[/*][*]Other numbered item![/*][/n]")
+    '1. Numbered item\n1. Other numbered item!\n'
+    """
+    bbcode_markup = (
+        newline()
+        # text formatting
+        | tag("f", "**")
+        | tag("k", "*")
+        | tag("u", "++")
+        | tag("d", "~~")
+        | tag("qm", '"')
+        | tag("sub", "~")
+        | tag("sup", "^")
+        | colored()
+        # left, right aligned, centered, justified - not supported
+        | tag("al", "")
+        | tag("ar", "")
+        | tag("c", "")
+        | tag("ab", "")
+        #
+        | tag("h1", "# ", replace_first_only=True)
+        | tag("h2", "## ", replace_first_only=True)
+        | tag("h3", "### ", replace_first_only=True)
+        | tag("h4", "#### ", replace_first_only=True)
+        | tag("h5", "##### ", replace_first_only=True)
+        | tag("h6", "###### ", replace_first_only=True)
+        | tag("q", "> ", replace_first_only=True)
+        | image()
+        | internal_link()
+        | list_()
     )
-    return markdown, note_links, images
+    # TODO: Why is a second pass needed?
+    bbcode_complex = code_block() | table()
+    return bbcode_complex.transform_string(bbcode_markup.transform_string(wikitext))
 
 
 class Converter(converter.BaseConverter):
@@ -128,6 +196,27 @@ def parse_attributes(self, zettel, note_imf: imf.Note):
                 case _:
                     self.logger.warning(f"ignoring attribute {key}={value}")
 
+    def handle_markdown_links(self, body, source_folder) -> tuple[list, list]:
+        note_links = []
+        resources = []
+        for link in common.get_markdown_links(body):
+            if link.is_web_link or link.is_mail_link:
+                continue  # keep the original links
+            if link.url.startswith("note://"):
+                original_id = link.url.replace("note://", "")
+                note_links.append(imf.NoteLink(str(link), original_id, link.text))
+            elif link.is_image:
+                resources.append(
+                    imf.Resource(source_folder / "img" / link.url, str(link), link.text)
+                )
+            else:
+                resources.append(
+                    imf.Resource(
+                        source_folder / "attachments" / link.url, str(link), link.text
+                    )
+                )
+        return resources, note_links
+
     def convert(self, file_or_folder: Path):
         # TODO
         # pylint: disable=too-many-branches,too-many-locals
@@ -169,19 +258,21 @@ def convert(self, file_or_folder: Path):
                     case "title":
                         pass  # handled already
                     case "content":
-                        body, note_links, images = bbcode_to_markdown(
-                            item.text if item.text else ""
-                        )
+                        body = bbcode_to_md(item.text if item.text else "")
                         note_imf.body = body
+                        resources, note_links = self.handle_markdown_links(
+                            body, file_or_folder.parent
+                        )
+                        note_imf.resources.extend(resources)
                         note_imf.note_links.extend(note_links)
 
-                        if images_available:
-                            for image in images:
-                                image.filename = images_folder / image.filename
-                                # Set manually, because with invalid path it's
-                                # set to False.
-                                image.is_image = True
-                                note_imf.resources.append(image)
+                        # if images_available:
+                        #     for image in images:
+                        #         image.filename = images_folder / image.filename
+                        #         # Set manually, because with invalid path it's
+                        #         # set to False.
+                        #         image.is_image = True
+                        #         note_imf.resources.extend(resources)
                     case "author":
                         note_imf.author = item.text
                     case "keywords":