From 351bfae4c1a3c5893f94e27d8b115322eeb2c891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20D=C3=B6rfelt?= Date: Sun, 6 Oct 2024 15:00:03 +0200 Subject: [PATCH] add support for zim --- docs/contributing/more_note_apps.md | 1 - docs/formats/zim.md | 25 +++++++ docs/index.md | 19 ++--- mkdocs.yml | 1 + src/formats/zim.py | 105 +++++++++++++++++++++++++++ src/importer.py | 4 +- src/intermediate_format.py | 2 + src/markdown_lib/common.py | 23 ++++++ src/markdown_lib/tiddlywiki.py | 27 ++----- src/markdown_lib/zim.py | 108 ++++++++++++++++++++++++++++ test/data | 2 +- test/test_convert.py | 1 + 12 files changed, 285 insertions(+), 33 deletions(-) create mode 100644 docs/formats/zim.md create mode 100644 src/formats/zim.py create mode 100644 src/markdown_lib/zim.py diff --git a/docs/contributing/more_note_apps.md b/docs/contributing/more_note_apps.md index e395f474..997d8ee5 100644 --- a/docs/contributing/more_note_apps.md +++ b/docs/contributing/more_note_apps.md @@ -93,5 +93,4 @@ https://github.com/LucasMatuszewski/snb2md-recursive | Wunderlist | [script](https://github.com/eschlot/Wunderlist2Joplin) | dead? | | [Xiaomi Notes](https://i.mi.com/note/h5) | | account needed | | [XWiki](https://www.xwiki.org/) | [doc](https://www.xwiki.org/xwiki/bin/view/Documentation/UserGuide/Features/Exports) | | -| [Zim](https://zim-wiki.org/index.html) | - [doc](https://zim-wiki.org/manual/Help/Export.html) (Markdown)
- [script](https://gist.github.com/reagle/7418f54fb6e40fe8d925e1c3f5325076) | | | [Zotero](https://www.zotero.org/) | [doc](https://www.zotero.org/support/kb/exporting) | | diff --git a/docs/formats/zim.md b/docs/formats/zim.md new file mode 100644 index 00000000..7daa3f8b --- /dev/null +++ b/docs/formats/zim.md @@ -0,0 +1,25 @@ +This page describes how to convert notes from Zim Wiki to Markdown. + +## General Information + +- [Website](https://zim-wiki.org/) +- Typical extension: Folder with `.txt` files + +## Instructions + +1. [Install jimmy](../index.md#installation) +2. Convert to Markdown. Example: `jimmy-cli-linux zim/folder --format zim` +3. [Import to your app](../import_instructions.md) + +## Import Structure + +Zim does a good job in [exporting to Markdown](https://zim-wiki.org/manual/Help/Export.html). If the built-in export is fine for you, you don't need to use Jimmy. + +Jimmy doesn't use pandoc for conversion and applies some additional tweaks: + +- Consistently use ATX style headings (starting with `#`). +- Consistently use spaces instead of tabs. +- Page title and creation date are removed from the note body. They are instead stored in the filename and the metadata, respectively. The metadata can be included as front matter. +- Convert Zim checklists to Markdown checklists (`- [ ]`) instead of Markdown lists with signs (`- ☐`). The checklist states are converted as described below: + - Done and not done are converted to `- [x]`. + - All other states are converted to `- [ ]`. diff --git a/docs/index.md b/docs/index.md index 84b63eba..73792d09 100644 --- a/docs/index.md +++ b/docs/index.md @@ -31,15 +31,16 @@ Alternative installation options: ```mermaid flowchart LR - A[App 1] -->|Backup| D - B[App 2] -->|Export| D - C[Filesystem] --> D - D(ZIP archive/JSON/folder) --> E - E{jimmy} --> F(Markdown + Frontmatter) - F -->|Import| G[Joplin] - F -->|Import| H[Obsidian] - F --> I[...] - F --> J[Editor, e. g. 
VSCode] + A[App 1] -->|Backup| M + B[App 2] -->|Export| M + C[...] --> M + D[Filesystem] --> M + M(ZIP archive/JSON/Folder) --> N + N{jimmy} --> O(Markdown + Frontmatter) + O -->|Import| P[Joplin] + O -->|Import| Q[Obsidian] + O --> R[...] + O --> S[Editor, e. g. VSCode] ``` 1. Export/backup notes from your note application diff --git a/mkdocs.yml b/mkdocs.yml index 5c17d798..1ba85c1d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -82,6 +82,7 @@ nav: - vCard: formats/vcard.md # - xit: formats/xit.md - Zettelkasten: formats/zettelkasten.md + - Zim: formats/zim.md - Zoho Notebook: formats/zoho_notebook.md - Import Instructions: import_instructions.md - Additional features: diff --git a/src/formats/zim.py b/src/formats/zim.py new file mode 100644 index 00000000..7ef8608e --- /dev/null +++ b/src/formats/zim.py @@ -0,0 +1,105 @@ +"""Convert Zim Wiki notes to the intermediate format.""" + +import datetime as dt +from pathlib import Path +import re + +import common +import converter +import intermediate_format as imf +import markdown_lib.common +from markdown_lib.zim import zim_to_md + + +ZIM_IMAGE_REGEX = re.compile(r"(\{\{(.*?)\}\})") + + +class Converter(converter.BaseConverter): + accept_folder = True + + def handle_zim_links(self, body: str) -> tuple[list, list]: + # https://zim-wiki.org/manual/Help/Links.html + # https://zim-wiki.org/manual/Help/Wiki_Syntax.html + note_links = [] + resources = [] + for _, url, description in markdown_lib.common.get_wikilink_links(body): + original_text = f"[[{url}]]" + if "/" in url: + # resource + # Links containing a '/' are considered links to external files + resource_path = common.find_file_recursively(self.root_path, url) + if resource_path is None: + continue + resources.append( + imf.Resource(resource_path, original_text, description or url) + ) + elif "?" in url: + # Links that contain a '?' 
are interwiki links + pass # interwiki links can't be resolved + elif url.startswith("#"): + # Links that start with a '#' are resolved as links + # within the page to a heading or an object + pass # they don't need to be resolved + else: + # Ignore other directives for now. + # TODO: Find a way to map them. Right now we only map by + # matching the original_id. + original_id = url.split(":")[-1].lstrip("+") + note_links.append( + imf.NoteLink(original_text, original_id, description or original_id) + ) + return resources, note_links + + def handle_zim_images(self, body: str) -> list[imf.Resource]: + images = [] + for original_text, image_link in ZIM_IMAGE_REGEX.findall(body): + image_link = Path(image_link) + images.append(imf.Resource(image_link, original_text, image_link.name)) + return images + + def convert_folder(self, folder: Path, parent: imf.Notebook): + for item in folder.iterdir(): + if item.is_dir(): + # notebook + new_parent = imf.Notebook(item.name) + self.convert_folder(item, new_parent) + parent.child_notebooks.append(new_parent) + continue + if item.name == "notebook.zim" or item.suffix.lower() != ".txt": + continue + + # note + title = item.stem.replace("_", " ") # underscores seem to be replaced + self.logger.debug(f'Converting note "{title}"') + + imf_note = imf.Note( + title, source_application=self.format, original_id=title + ) + + metadata, _, body = item.read_text(encoding="utf-8").split( + "\n\n", maxsplit=2 + ) + for line in metadata.split("\n"): + key, value = line.split(": ", maxsplit=1) + if key == "Creation-Date": + imf_note.created = dt.datetime.fromisoformat(value) + + imf_note.body = zim_to_md(body) + + resources, note_links = self.handle_zim_links(imf_note.body) + imf_note.resources = resources + imf_note.note_links = note_links + + imf_note.resources.extend(self.handle_zim_images(imf_note.body)) + + # tags: https://zim-wiki.org/manual/Help/Tags.html + # TODO: exclude invalid characters + imf_note.tags = [ + imf.Tag(tag) for tag in 
markdown_lib.common.get_inline_tags(body, ["@"]) + ] + + parent.child_notes.append(imf_note) + + def convert(self, file_or_folder: Path): + self.root_path = file_or_folder + self.convert_folder(file_or_folder, self.root_notebook) diff --git a/src/importer.py b/src/importer.py index 6d632752..e61427cc 100644 --- a/src/importer.py +++ b/src/importer.py @@ -307,7 +307,9 @@ def update_note_links(self, note: imf.Note): if new_path is None: LOGGER.debug( f'Note "{note.title}": ' - f'could not find linked note: "{note_link.original_text}"' + f'could not find linked note: "{note_link.original_text}"', + # prevent [[]] syntax titles to be handled as markup + extra={"markup": None}, ) continue diff --git a/src/intermediate_format.py b/src/intermediate_format.py index 139f499b..ef38577b 100644 --- a/src/intermediate_format.py +++ b/src/intermediate_format.py @@ -37,6 +37,8 @@ class Resource: path: Path | None = None def __post_init__(self): + # resolve the user directory to prevent issues with puremagic + self.filename = self.filename.expanduser() # We can't simply match by extension, because sometimes the files/images # are stored as binary blob without extension. self.is_image = common.is_image(self.filename) diff --git a/src/markdown_lib/common.py b/src/markdown_lib/common.py index a44e71d0..0aa4b077 100644 --- a/src/markdown_lib/common.py +++ b/src/markdown_lib/common.py @@ -196,6 +196,7 @@ def get_inline_tags(text: str, start_characters: list[str]) -> list[str]: # markdown output formats: # https://pandoc.org/chunkedhtml-demo/8.22-markdown-variants.html # Don't use "commonmark_x". There would be too many noise. 
+# fmt: off PANDOC_OUTPUT_FORMAT = ( "markdown_strict" "+pipe_tables" @@ -203,7 +204,29 @@ def get_inline_tags(text: str, start_characters: list[str]) -> list[str]: "+task_lists" "-raw_html" ) +# fmt:on def markup_to_markdown(text: str, format_: str = "html") -> str: return pypandoc.convert_text(text, PANDOC_OUTPUT_FORMAT, format=format_) + + +# Problem: "//" is part of many URI (between scheme and host). +# We need to exclude them to prevent unwanted conversions. +# https://en.wikipedia.org/wiki/List_of_URI_schemes +schemes = [ + "file", + "ftp", + "http", + "https", + "imap", + "irc", + "udp", + "tcp", + "ntp", + "app", + "s3", +] +NEG_LOOKBEHINDS = "".join(f"(? seems to be even slower # - use regex instead of chaining multiline_quote_re = re.compile(r"<<<\n([\S\s]*?)\n<<<(.*)") -horizontal_line_re = re.compile(r"^-{3,}$", re.MULTILINE) link_re = re.compile(r"\[(ext|img)?.*\[(.*)\]\]") list_re = re.compile(r"^([*#>]+) ", re.MULTILINE) table_row_re = re.compile(r"\|(.*?)\|([kchf])?\n") -# Problem: "//" is part of many URI (between scheme and host). -# We need to exclude them to prevent unwanted conversions. -# https://en.wikipedia.org/wiki/List_of_URI_schemes -schemes = [ - "file", - "ftp", - "http", - "https", - "imap", - "irc", - "udp", - "tcp", - "ntp", - "app", - "s3", -] -NEG_LOOKBEHINDS = "".join(f"(?*x])\] ", re.MULTILINE) + + +def quote(source_tag, target_tag): + """Conversion of a quoted string. I. e. 
with the same start and end tags.""" + + def to_md(_, t): # noqa + return target_tag + t[0] + target_tag + + return pp.QuotedString(source_tag).set_parse_action(to_md) + + +def subscript(): + def to_md(_, t): # noqa + return "~" + t[0] + "~" + + return pp.QuotedString("_{", endQuoteChar="}").set_parse_action(to_md) + + +def superscript(): + def to_md(_, t): # noqa + return "^" + t[0] + "^" + + return pp.QuotedString("^{", endQuoteChar="}").set_parse_action(to_md) + + +def italic(): + def to_md(_, t): # noqa + return "*" + t[0][0] + "*" + + return pp.Regex( + markdown_lib.common.double_slash_re, as_group_list=True + ).set_parse_action(to_md) + + +def horizontal_line(): + return pp.Regex(markdown_lib.common.horizontal_line_re).set_parse_action( + lambda: "\n---\n" + ) + + +def heading(): + def to_md(_, t): # noqa + return "#" * (7 - len(t[0][0])) + " " + t[0][1] + + return pp.Regex(heading_re, as_group_list=True).set_parse_action(to_md) + + +def checklist(): + def to_md(_, t): # noqa + list_char = "x" if t[0][1] in ("*", "x") else " " + return f"{t[0][0]}- [{list_char}] " + + return pp.Regex(checklist_re, as_group_list=True).set_parse_action(to_md) + + +def zim_to_md(zim_text: str) -> str: + r""" + Main Zim Wiki to Markdown conversion function. 
+ + >>> zim_to_md("''monospace'' **bold**") + '`monospace` **bold**' + >>> zim_to_md("super^{script}, sub_{script}") + 'super^script^, sub~script~' + >>> zim_to_md("====== heading 1 ======") + '# heading 1' + >>> zim_to_md("== heading5 ==") + '##### heading5' + >>> zim_to_md("'''\nsome code\nblock\n'''") + '```\nsome code\nblock\n```' + >>> zim_to_md("[ ] unchecked\n[x] not done") + '- [ ] unchecked\n- [x] not done' + >>> zim_to_md("[ ] u\n [>] np\n [*] nd\n[x] nd") + '- [ ] u\n - [ ] np\n - [x] nd\n- [x] nd' + >>> zim_to_md("* lvl1\n\t* lvl2\n\t* lvl2\n* lvl1") + '* lvl1\n * lvl2\n * lvl2\n* lvl1' + """ + zim_markup = ( + pp.Literal("'''").set_parse_action(lambda: "```") + # text formatting + | quote("''", "`") + | italic() + | subscript() + | superscript() + # + | horizontal_line() + | heading() + | checklist() + ) + + # TODO: str.translate() seems to be fastest + # https://stackoverflow.com/a/8958372 + zim_text = zim_text.replace("\t", " " * 4) + return zim_markup.transform_string(zim_text) diff --git a/test/data b/test/data index ccc4b4ba..0240cf4a 160000 --- a/test/data +++ b/test/data @@ -1 +1 @@ -Subproject commit ccc4b4ba0b4c4c841290669634045a2edad25a4b +Subproject commit 0240cf4a241e007294758a31a8ba6e548e3a9782 diff --git a/test/test_convert.py b/test/test_convert.py index 82536b3c..1c9e1ee6 100644 --- a/test/test_convert.py +++ b/test/test_convert.py @@ -126,6 +126,7 @@ def compare_dirs(dir1: Path, dir2: Path): [["tomboy_ng/test_1/gnote"]], [["tomboy_ng/test_2/tomboy-ng"]], [["zettelkasten/test_1/test_zettelkasten.zkn3"]], + [["zim/test_1/notebook"]], [["zoho_notebook/test_1/Notebook_14Apr2024_1300_html.zip"]], ], name_func=name_func,