Skip to content

Commit

Permalink
use pyparsing instead of bbcode
Browse files Browse the repository at this point in the history
  • Loading branch information
marph91 committed Sep 26, 2024
1 parent 0ec8295 commit 806a99a
Show file tree
Hide file tree
Showing 4 changed files with 186 additions and 97 deletions.
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ extra_checks = true
disable_error_code = "attr-defined"
[[tool.mypy.overrides]]
module = [
"bbcode",
"enlighten",
"frontmatter",
"puremagic",
Expand Down
1 change: 0 additions & 1 deletion requirements/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
bbcode==1.1.0
beautifulsoup4==4.12.3
enlighten==1.12.4
markdown==3.7
Expand Down
2 changes: 1 addition & 1 deletion src/formats/tiddlywiki.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def wikitext_to_md(wikitext: str) -> str:
| list_()
)
# TODO: Why does "table" overwrite other rules when executes in the same run?
wikitext_complex =(
wikitext_complex = (
# block quote:
# https://tiddlywiki.com/static/Block%2520Quotes%2520in%2520WikiText.html
multiline_quote()
Expand Down
279 changes: 185 additions & 94 deletions src/formats/zettelkasten.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,109 +2,177 @@

import datetime as dt
from pathlib import Path
import re
import xml.etree.ElementTree as ET # noqa: N817

import bbcode
import pyparsing as pp

import common
import converter
import intermediate_format as imf


def bbcode_to_markdown(
bbcode_str: str,
) -> tuple[str, list[imf.NoteLink], list[imf.Resource]]:
# pylint: disable=unused-argument
note_links = []
images = []

parser = bbcode.Parser()
parser.add_simple_formatter("h1", "# %(value)s")
parser.add_simple_formatter("h2", "## %(value)s")
parser.add_simple_formatter("br", "\n", standalone=True)
parser.add_simple_formatter("q", "> %(value)s")
parser.add_simple_formatter("code", "\n```\n%(value)s\n```")

# left, right aligned, centered, justified - not supported
parser.add_simple_formatter("al", "%(value)s")
parser.add_simple_formatter("ar", "%(value)s")
parser.add_simple_formatter("c", "%(value)s")
parser.add_simple_formatter("ab", "%(value)s")

# text formatting
parser.add_simple_formatter("f", "**%(value)s**")
parser.add_simple_formatter("k", "*%(value)s*")
parser.add_simple_formatter("u", "++%(value)s++")
parser.add_simple_formatter("d", "~~%(value)s~~")
parser.add_simple_formatter("qm", '"%(value)s"')
parser.add_simple_formatter("sub", "~%(value)s~")
parser.add_simple_formatter("sup", "^%(value)s^")
# Prevent spaces, tabs and newlines from being stripped.
pp.ParserElement.set_default_whitespace_chars("")


# Highlighted text: "[h <attributes>]text[/h]". Group 1 is the enclosed text.
colored_re = re.compile(r"\[h .*\](.*?)\[\/h\]")
# Link to another note: "[z <digits>]text[/z]".
# Group 1 is the numeric note ID, group 2 the link text.
internal_link_re = re.compile(r"\[z (\d+)\](.*?)\[\/z\]")
# Table with an optional leading "[tc]caption[/tc]" element.
# Group 1 is the whole caption element, group 2 the caption text (both None
# when there is no caption) and group 3 the remaining table content, which
# may span multiple lines.
table_re = re.compile(r"\[table\](\[tc\](.*?)\[\/tc\])?([\S\s]*?)\[\/table\]")
# hacky, but works for now
# Group 1 is the list type ("l" or "n"). Group 2 is everything between the
# first "[*]" and the "[/*]" directly in front of the closing list tag, i. e.
# the items still separated by "[/*][*]". The closing tag is not required to
# use the same letter as the opening tag.
list_re = re.compile(r"\[([ln])\]\[\*\](.*?)\[\/\*\]\[\/[ln]\]")


def tag(source_tag, target_tag, replace_first_only=False):
    """Build a parser that rewrites ``[source_tag]...[/source_tag]`` spans.

    The enclosed text is prefixed with ``target_tag``. Unless
    ``replace_first_only`` is set, ``target_tag`` is appended as well.
    """
    opening = f"[{source_tag}]"
    closing = f"[/{source_tag}]"

    def to_md(_, t):  # noqa
        enclosed = t[0]
        suffix = "" if replace_first_only else target_tag
        return target_tag + enclosed + suffix

    quoted = pp.QuotedString(opening, end_quote_char=closing)
    return quoted.set_parse_action(to_md)


def newline():
    """Build a parser that turns the ``[br]`` tag into a line break."""
    line_break = pp.Literal("[br]")
    return line_break.set_parse_action(lambda: "\n")


def colored():
    """Build a parser for highlighted text, i. e. ``[h <attributes>]text[/h]``.

    The highlight attributes are dropped and the text is rendered bold.
    """

    # colored -> bold
    def to_md(_, t):  # noqa
        # The regex is parsed with "as_group_list=True", so the first token
        # holds the regex groups. Group 0 is the highlighted text.
        return "**" + t[0][0] + "**"

    return pp.Regex(colored_re, as_group_list=True).set_parse_action(to_md)


def code_block():
    """Build a parser that converts ``[code]...[/code]`` to a fenced block.

    The enclosed code may span multiple lines.
    """
    quoted = pp.QuotedString("[code]", end_quote_char="[/code]", multiline=True)

    def to_md(_, t):  # noqa
        return "\n```\n" + t[0] + "\n```"

    return quoted.set_parse_action(to_md)


def list_():
    """Build a parser for bullet (``[l]``) and numbered (``[n]``) lists.

    Each item becomes one Markdown line. Numbered items all use ``1.``.
    """
    markers = {"l": "*", "n": "1."}

    def to_md(_, t):  # noqa
        type_, content = t[0]
        marker = markers[type_]
        # The regex strips the outermost "[*]" and "[/*]", so the items are
        # only separated by "[/*][*]" at this point.
        items = content.split("[/*][*]")
        return "".join(f"{marker} {item}\n" for item in items)

    list_parser = pp.Regex(list_re, as_group_list=True)
    return list_parser.set_parse_action(to_md)


def image():
    """Build a parser that converts ``[img]path[/img]`` to a Markdown image.

    The path is used as alternative text and as link target.
    """

    def to_md(_, t):  # noqa
        path = t[0]
        return f"![{path}]({path})"

    quoted = pp.QuotedString("[img]", end_quote_char="[/img]")
    return quoted.set_parse_action(to_md)


def internal_link():
    """Build a parser that converts ``[z <id>]title[/z]`` to a note link.

    The target note ID is encoded in a ``note://`` URL.
    """

    def to_md(_, t):  # noqa
        groups = t[0]
        id_ = groups[0]
        title = groups[1]
        return "[" + title + "](note://" + id_ + ")"

    link_parser = pp.Regex(internal_link_re, as_group_list=True)
    return link_parser.set_parse_action(to_md)


def table():
    """Build a parser that converts ``[table]...[/table]`` to a Markdown table.

    Table lines are separated by newlines. A line containing ``^`` is a header
    row with ``^`` as cell separator. Any other non-blank line is a data row
    with ``|`` as cell separator. An optional leading ``[tc]...[/tc]`` element
    provides the caption.
    """

    def to_md(_, t):  # noqa
        # Regex groups: the whole caption element (unused), the caption text
        # and the remaining table content.
        _, caption, content = t[0]

        table_md = common.MarkdownTable()
        # The caption group is None if the table has no "[tc]" element.
        if caption is not None:
            table_md.caption = caption

        for line in content.split("\n"):
            if not line.strip():
                continue
            if "^" in line:
                table_md.header_rows.append(line.split("^"))
            else:
                # A line without "|" yields a data row with a single cell.
                table_md.data_rows.append(line.split("|"))
        return table_md.create_md()

    return pp.Regex(table_re, as_group_list=True).set_parse_action(to_md)


def bbcode_to_md(wikitext: str) -> str:
    r"""
    Main bbcode to Markdown conversion function.

    The conversion runs in two passes: first the inline markup, lists, images
    and note links, then code blocks and tables on the result of the first
    pass.

    # hyperlinks are markdown already
    >>> bbcode_to_md("[f]fett[/f]")
    '**fett**'
    >>> bbcode_to_md("das ist [d]durchgestrichener[/d] text")
    'das ist ~~durchgestrichener~~ text'
    >>> bbcode_to_md("[h #ffff00]colored[/h] text")
    '**colored** text'
    >>> bbcode_to_md("[h3]heading 3[/h3]")
    '### heading 3'
    >>> bbcode_to_md("some[br]li nes[br]he re")
    'some\nli nes\nhe re'
    >>> bbcode_to_md("[q]single line quote[/q]")
    '> single line quote'
    >>> bbcode_to_md("disappearing [al]tag[/al]")
    'disappearing tag'
    >>> bbcode_to_md("[code]some code[/code]")
    '\n```\nsome code\n```'
    >>> bbcode_to_md("[code]long[br]code block[/code]")
    '\n```\nlong\ncode block\n```'
    >>> bbcode_to_md("[img]some image.png[/img]")
    '![some image.png](some image.png)'
    >>> bbcode_to_md("link [z 3]zu Zettel 3[/z]")
    'link [zu Zettel 3](note://3)'
    >>> bbcode_to_md("[table][tc]Test Table[/tc][br]h 1^h 2^h3[br]d1 |d 2 |d3[/table]")
    'Test Table\n\n| h 1 | h 2 | h3 |\n| --- | --- | --- |\n| d1 | d 2 | d3 |\n'
    >>> bbcode_to_md("[table]h 1^h 2^h3[br]d1 |d 2 |d3[/table]")
    '| h 1 | h 2 | h3 |\n| --- | --- | --- |\n| d1 | d 2 | d3 |\n'
    >>> bbcode_to_md("[l][*]Here an item[/*][*]Other item![/*][/l]")
    '* Here an item\n* Other item!\n'
    >>> bbcode_to_md("[n][*]Numbered item[/*][*]Other numbered item![/*][/n]")
    '1. Numbered item\n1. Other numbered item!\n'
    """
    bbcode_markup = (
        newline()
        # text formatting
        | tag("f", "**")
        | tag("k", "*")
        | tag("u", "++")
        | tag("d", "~~")
        | tag("qm", '"')
        | tag("sub", "~")
        | tag("sup", "^")
        | colored()
        # left, right aligned, centered, justified - not supported
        | tag("al", "")
        | tag("ar", "")
        | tag("c", "")
        | tag("ab", "")
        # headings and quotes: only the opening tag has a Markdown equivalent
        | tag("h1", "# ", replace_first_only=True)
        | tag("h2", "## ", replace_first_only=True)
        | tag("h3", "### ", replace_first_only=True)
        | tag("h4", "#### ", replace_first_only=True)
        | tag("h5", "##### ", replace_first_only=True)
        | tag("h6", "###### ", replace_first_only=True)
        | tag("q", "> ", replace_first_only=True)
        | image()
        | internal_link()
        | list_()
    )
    # TODO: Why is a second pass needed?
    bbcode_complex = code_block() | table()
    return bbcode_complex.transform_string(bbcode_markup.transform_string(wikitext))


class Converter(converter.BaseConverter):
Expand All @@ -128,6 +196,27 @@ def parse_attributes(self, zettel, note_imf: imf.Note):
case _:
self.logger.warning(f"ignoring attribute {key}={value}")

def handle_markdown_links(self, body, source_folder) -> tuple[list, list]:
    """Collect the resources and note links referenced in a Markdown body.

    Web and mail links are skipped. Links with a ``note://`` URL become note
    links. Image links are resolved against ``source_folder / "img"`` and all
    remaining links against ``source_folder / "attachments"``.

    Args:
        body: Markdown text to scan for links.
        source_folder: Folder that contains the "img" and "attachments"
            folders. Joined with ``/``, so presumably a ``Path``.

    Returns:
        Tuple of ``(resources, note_links)``.
    """
    note_links = []
    resources = []
    for link in common.get_markdown_links(body):
        if link.is_web_link or link.is_mail_link:
            continue  # keep the original links
        if link.url.startswith("note://"):
            # Strip the leading scheme only. "replace()" would also remove
            # any later occurrence of "note://" inside the ID.
            original_id = link.url.removeprefix("note://")
            note_links.append(imf.NoteLink(str(link), original_id, link.text))
            continue
        subfolder = "img" if link.is_image else "attachments"
        resources.append(
            imf.Resource(source_folder / subfolder / link.url, str(link), link.text)
        )
    return resources, note_links

def convert(self, file_or_folder: Path):
# TODO
# pylint: disable=too-many-branches,too-many-locals
Expand Down Expand Up @@ -169,19 +258,21 @@ def convert(self, file_or_folder: Path):
case "title":
pass # handled already
case "content":
body, note_links, images = bbcode_to_markdown(
item.text if item.text else ""
)
body = bbcode_to_md(item.text if item.text else "")
note_imf.body = body
resources, note_links = self.handle_markdown_links(
body, file_or_folder.parent
)
note_imf.resources.extend(resources)
note_imf.note_links.extend(note_links)

if images_available:
for image in images:
image.filename = images_folder / image.filename
# Set manually, because with invalid path it's
# set to False.
image.is_image = True
note_imf.resources.append(image)
# if images_available:
# for image in images:
# image.filename = images_folder / image.filename
# # Set manually, because with invalid path it's
# # set to False.
# image.is_image = True
# note_imf.resources.extend(resources)
case "author":
note_imf.author = item.text
case "keywords":
Expand Down

0 comments on commit 806a99a

Please sign in to comment.