From 351bfae4c1a3c5893f94e27d8b115322eeb2c891 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20D=C3=B6rfelt?= <martin.d@andix.de>
Date: Sun, 6 Oct 2024 15:00:03 +0200
Subject: [PATCH] add support for zim

---
 docs/contributing/more_note_apps.md |   1 -
 docs/formats/zim.md                 |  25 +++++++
 docs/index.md                       |  19 ++---
 mkdocs.yml                          |   1 +
 src/formats/zim.py                  | 105 +++++++++++++++++++++++++++
 src/importer.py                     |   4 +-
 src/intermediate_format.py          |   2 +
 src/markdown_lib/common.py          |  23 ++++++
 src/markdown_lib/tiddlywiki.py      |  27 ++-----
 src/markdown_lib/zim.py             | 108 ++++++++++++++++++++++++++++
 test/data                           |   2 +-
 test/test_convert.py                |   1 +
 12 files changed, 285 insertions(+), 33 deletions(-)
 create mode 100644 docs/formats/zim.md
 create mode 100644 src/formats/zim.py
 create mode 100644 src/markdown_lib/zim.py
diff --git a/docs/contributing/more_note_apps.md b/docs/contributing/more_note_apps.md
index e395f474..997d8ee5 100644
--- a/docs/contributing/more_note_apps.md
+++ b/docs/contributing/more_note_apps.md
@@ -93,5 +93,4 @@ https://github.com/LucasMatuszewski/snb2md-recursive
 | Wunderlist | [script](https://github.com/eschlot/Wunderlist2Joplin) | dead? |
 | [Xiaomi Notes](https://i.mi.com/note/h5) |     | account needed |
 | [XWiki](https://www.xwiki.org/) | [doc](https://www.xwiki.org/xwiki/bin/view/Documentation/UserGuide/Features/Exports) |     |
-| [Zim](https://zim-wiki.org/index.html) | - [doc](https://zim-wiki.org/manual/Help/Export.html) (Markdown) <br>- [script](https://gist.github.com/reagle/7418f54fb6e40fe8d925e1c3f5325076) |     |
 | [Zotero](https://www.zotero.org/) | [doc](https://www.zotero.org/support/kb/exporting) |     |
diff --git a/docs/formats/zim.md b/docs/formats/zim.md
new file mode 100644
index 00000000..7daa3f8b
--- /dev/null
+++ b/docs/formats/zim.md
@@ -0,0 +1,25 @@
+This page describes how to convert notes from Zim Wiki to Markdown.
+
+## General Information
+
+- [Website](https://zim-wiki.org/)
+- Typical extension: Folder with `.txt` files
+
+## Instructions
+
+1. [Install jimmy](../index.md#installation)
+2. Convert to Markdown. Example: `jimmy-cli-linux zim/folder --format zim`
+3. [Import to your app](../import_instructions.md)
+
+## Import Structure
+
+Zim does a good job at [exporting to Markdown](https://zim-wiki.org/manual/Help/Export.html). If the built-in export is fine for you, you don't need to use Jimmy.
+
+Jimmy doesn't use pandoc for conversion and applies some additional tweaks:
+
+- Consistently use ATX style headings (starting with `#`).
+- Consistently use spaces instead of tabs.
+- Page title and creation date are removed from the note body. They are instead stored in the filename and the metadata, respectively. The metadata can be included via front matter.
+- Convert Zim checklists to Markdown checklists (`- [ ]`) instead of Markdown lists with signs (`- ☐`). The checklist states are converted as described below:
+    - Done and not done are converted to `- [x]`.
+    - All other states are converted to `- [ ]`.
diff --git a/docs/index.md b/docs/index.md
index 84b63eba..73792d09 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -31,15 +31,16 @@ Alternative installation options:
 
 ```mermaid
 flowchart LR
-    A[App 1] -->|Backup| D
-    B[App 2] -->|Export| D
-    C[Filesystem] --> D
-    D(ZIP archive/JSON/folder) --> E
-    E{jimmy} --> F(Markdown + Frontmatter)
-    F -->|Import| G[Joplin]
-    F -->|Import| H[Obsidian]
-    F --> I[...]
-    F --> J[Editor, e. g. VSCode]
+    A[App 1] -->|Backup| M
+    B[App 2] -->|Export| M
+    C[...] --> M
+    D[Filesystem] --> M
+    M(ZIP archive/JSON/Folder) --> N
+    N{jimmy} --> O(Markdown + Frontmatter)
+    O -->|Import| P[Joplin]
+    O -->|Import| Q[Obsidian]
+    O --> R[...]
+    O --> S[Editor, e. g. VSCode]
 ```
 
 1. Export/backup notes from your note application
diff --git a/mkdocs.yml b/mkdocs.yml
index 5c17d798..1ba85c1d 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -82,6 +82,7 @@ nav:
     - vCard: formats/vcard.md
     # - xit: formats/xit.md
     - Zettelkasten: formats/zettelkasten.md
+    - Zim: formats/zim.md
     - Zoho Notebook: formats/zoho_notebook.md
   - Import Instructions: import_instructions.md
   - Additional features:
diff --git a/src/formats/zim.py b/src/formats/zim.py
new file mode 100644
index 00000000..7ef8608e
--- /dev/null
+++ b/src/formats/zim.py
@@ -0,0 +1,105 @@
+"""Convert Zim Wiki notes to the intermediate format."""
+
+import datetime as dt
+from pathlib import Path
+import re
+
+import common
+import converter
+import intermediate_format as imf
+import markdown_lib.common
+from markdown_lib.zim import zim_to_md
+
+
+ZIM_IMAGE_REGEX = re.compile(r"(\{\{(.*?)\}\})")
+
+
+class Converter(converter.BaseConverter):
+    accept_folder = True
+
+    def handle_zim_links(self, body: str) -> tuple[list, list]:
+        # https://zim-wiki.org/manual/Help/Links.html
+        # https://zim-wiki.org/manual/Help/Wiki_Syntax.html
+        note_links = []
+        resources = []
+        for _, url, description in markdown_lib.common.get_wikilink_links(body):
+            original_text = f"[[{url}]]"
+            if "/" in url:
+                # resource
+                # Links containing a '/' are considered links to external files
+                resource_path = common.find_file_recursively(self.root_path, url)
+                if resource_path is None:
+                    continue
+                resources.append(
+                    imf.Resource(resource_path, original_text, description or url)
+                )
+            elif "?" in url:
+                # Links that contain a '?' are interwiki links
+                pass  # interwiki links can't be resolved
+            elif url.startswith("#"):
+                # Links that start with a '#' are resolved as links
+                # within the page to a heading or an object
+                pass  # they don't need to be resolved
+            else:
+                # Ignore other directives for now.
+                # TODO: Find a way to map them. Right now we only map by
+                # matching the original_id.
+                original_id = url.split(":")[-1].lstrip("+")
+                note_links.append(
+                    imf.NoteLink(original_text, original_id, description or original_id)
+                )
+        return resources, note_links
+
+    def handle_zim_images(self, body: str) -> list[imf.Resource]:
+        images = []
+        for original_text, image_link in ZIM_IMAGE_REGEX.findall(body):
+            image_link = Path(image_link)
+            images.append(imf.Resource(image_link, original_text, image_link.name))
+        return images
+
+    def convert_folder(self, folder: Path, parent: imf.Notebook):
+        for item in folder.iterdir():
+            if item.is_dir():
+                # notebook
+                new_parent = imf.Notebook(item.name)
+                self.convert_folder(item, new_parent)
+                parent.child_notebooks.append(new_parent)
+                continue
+            if item.name == "notebook.zim" or item.suffix.lower() != ".txt":
+                continue
+
+            # note
+            title = item.stem.replace("_", " ")  # underscores seem to be replaced
+            self.logger.debug(f'Converting note "{title}"')
+
+            imf_note = imf.Note(
+                title, source_application=self.format, original_id=title
+            )
+
+            metadata, _, body = item.read_text(encoding="utf-8").split(
+                "\n\n", maxsplit=2
+            )
+            for line in metadata.split("\n"):
+                key, value = line.split(": ", maxsplit=1)
+                if key == "Creation-Date":
+                    imf_note.created = dt.datetime.fromisoformat(value)
+
+            imf_note.body = zim_to_md(body)
+
+            resources, note_links = self.handle_zim_links(imf_note.body)
+            imf_note.resources = resources
+            imf_note.note_links = note_links
+
+            imf_note.resources.extend(self.handle_zim_images(imf_note.body))
+
+            # tags: https://zim-wiki.org/manual/Help/Tags.html
+            # TODO: exclude invalid characters
+            imf_note.tags = [
+                imf.Tag(tag) for tag in markdown_lib.common.get_inline_tags(body, ["@"])
+            ]
+
+            parent.child_notes.append(imf_note)
+
+    def convert(self, file_or_folder: Path):
+        self.root_path = file_or_folder
+        self.convert_folder(file_or_folder, self.root_notebook)
diff --git a/src/importer.py b/src/importer.py
index 6d632752..e61427cc 100644
--- a/src/importer.py
+++ b/src/importer.py
@@ -307,7 +307,9 @@ def update_note_links(self, note: imf.Note):
             if new_path is None:
                 LOGGER.debug(
                     f'Note "{note.title}": '
-                    f'could not find linked note: "{note_link.original_text}"'
+                    f'could not find linked note: "{note_link.original_text}"',
+                    # prevent titles with [[]] syntax from being handled as markup
+                    extra={"markup": None},
                 )
                 continue
 
diff --git a/src/intermediate_format.py b/src/intermediate_format.py
index 139f499b..ef38577b 100644
--- a/src/intermediate_format.py
+++ b/src/intermediate_format.py
@@ -37,6 +37,8 @@ class Resource:
     path: Path | None = None
 
     def __post_init__(self):
+        # resolve the user directory to prevent issues with puremagic
+        self.filename = self.filename.expanduser()
         # We can't simply match by extension, because sometimes the files/images
         # are stored as binary blob without extension.
         self.is_image = common.is_image(self.filename)
diff --git a/src/markdown_lib/common.py b/src/markdown_lib/common.py
index a44e71d0..0aa4b077 100644
--- a/src/markdown_lib/common.py
+++ b/src/markdown_lib/common.py
@@ -196,6 +196,7 @@ def get_inline_tags(text: str, start_characters: list[str]) -> list[str]:
 # markdown output formats:
 # https://pandoc.org/chunkedhtml-demo/8.22-markdown-variants.html
 # Don't use "commonmark_x". There would be too many noise.
+# fmt: off
 PANDOC_OUTPUT_FORMAT = (
     "markdown_strict"
     "+pipe_tables"
@@ -203,7 +204,29 @@ def get_inline_tags(text: str, start_characters: list[str]) -> list[str]:
     "+task_lists"
     "-raw_html"
 )
+# fmt:on
 
 
 def markup_to_markdown(text: str, format_: str = "html") -> str:
     return pypandoc.convert_text(text, PANDOC_OUTPUT_FORMAT, format=format_)
+
+
+# Problem: "//" is part of many URI (between scheme and host).
+# We need to exclude them to prevent unwanted conversions.
+# https://en.wikipedia.org/wiki/List_of_URI_schemes
+schemes = [
+    "file",
+    "ftp",
+    "http",
+    "https",
+    "imap",
+    "irc",
+    "udp",
+    "tcp",
+    "ntp",
+    "app",
+    "s3",
+]
+NEG_LOOKBEHINDS = "".join(f"(?<!{scheme}:)" for scheme in schemes)
+double_slash_re = re.compile(rf"{NEG_LOOKBEHINDS}\/\/(.*?){NEG_LOOKBEHINDS}\/\/")
+horizontal_line_re = re.compile(r"^-{3,}$", re.MULTILINE)
diff --git a/src/markdown_lib/tiddlywiki.py b/src/markdown_lib/tiddlywiki.py
index f8ebfd52..e401a74b 100644
--- a/src/markdown_lib/tiddlywiki.py
+++ b/src/markdown_lib/tiddlywiki.py
@@ -19,28 +19,9 @@
 # - pp.ParserElement.enable_packrat() -> seems to be even slower
 # - use regex instead of chaining
 multiline_quote_re = re.compile(r"<<<\n([\S\s]*?)\n<<<(.*)")
-horizontal_line_re = re.compile(r"^-{3,}$", re.MULTILINE)
 link_re = re.compile(r"\[(ext|img)?.*\[(.*)\]\]")
 list_re = re.compile(r"^([*#>]+) ", re.MULTILINE)
 table_row_re = re.compile(r"\|(.*?)\|([kchf])?\n")
-# Problem: "//" is part of many URI (between scheme and host).
-# We need to exclude them to prevent unwanted conversions.
-# https://en.wikipedia.org/wiki/List_of_URI_schemes
-schemes = [
-    "file",
-    "ftp",
-    "http",
-    "https",
-    "imap",
-    "irc",
-    "udp",
-    "tcp",
-    "ntp",
-    "app",
-    "s3",
-]
-NEG_LOOKBEHINDS = "".join(f"(?<!{scheme}:)" for scheme in schemes)
-italic_re = re.compile(rf"{NEG_LOOKBEHINDS}\/\/(.*?){NEG_LOOKBEHINDS}\/\/")
 
 
 def dash():
@@ -70,11 +51,15 @@ def italic():
     def to_md(_, t):  # noqa
         return "*" + t[0][0] + "*"
 
-    return pp.Regex(italic_re, as_group_list=True).set_parse_action(to_md)
+    return pp.Regex(
+        markdown_lib.common.double_slash_re, as_group_list=True
+    ).set_parse_action(to_md)
 
 
 def horizontal_line():
-    return pp.Regex(horizontal_line_re).set_parse_action(lambda: "---")
+    return pp.Regex(markdown_lib.common.horizontal_line_re).set_parse_action(
+        lambda: "---"
+    )
 
 
 def link():
diff --git a/src/markdown_lib/zim.py b/src/markdown_lib/zim.py
new file mode 100644
index 00000000..03eda4e9
--- /dev/null
+++ b/src/markdown_lib/zim.py
@@ -0,0 +1,108 @@
+"""Convert Zim Wiki to Markdown."""
+
+import re
+
+import pyparsing as pp
+
+import markdown_lib.common
+
+
+# Prevent spaces, tabs and newlines from being stripped.
+pp.ParserElement.set_default_whitespace_chars("")
+
+
+heading_re = re.compile(r"(={1,6}) (.*?) ={1,6}")
+checklist_re = re.compile(r"^( *)\[([ <>*x])\] ", re.MULTILINE)
+
+
+def quote(source_tag, target_tag):
+    """Conversion of a quoted string. I. e. with the same start and end tags."""
+
+    def to_md(_, t):  # noqa
+        return target_tag + t[0] + target_tag
+
+    return pp.QuotedString(source_tag).set_parse_action(to_md)
+
+
+def subscript():
+    def to_md(_, t):  # noqa
+        return "~" + t[0] + "~"
+
+    return pp.QuotedString("_{", endQuoteChar="}").set_parse_action(to_md)
+
+
+def superscript():
+    def to_md(_, t):  # noqa
+        return "^" + t[0] + "^"
+
+    return pp.QuotedString("^{", endQuoteChar="}").set_parse_action(to_md)
+
+
+def italic():
+    def to_md(_, t):  # noqa
+        return "*" + t[0][0] + "*"
+
+    return pp.Regex(
+        markdown_lib.common.double_slash_re, as_group_list=True
+    ).set_parse_action(to_md)
+
+
+def horizontal_line():
+    return pp.Regex(markdown_lib.common.horizontal_line_re).set_parse_action(
+        lambda: "\n---\n"
+    )
+
+
+def heading():
+    def to_md(_, t):  # noqa
+        return "#" * (7 - len(t[0][0])) + " " + t[0][1]
+
+    return pp.Regex(heading_re, as_group_list=True).set_parse_action(to_md)
+
+
+def checklist():
+    def to_md(_, t):  # noqa
+        list_char = "x" if t[0][1] in ("*", "x") else " "
+        return f"{t[0][0]}- [{list_char}] "
+
+    return pp.Regex(checklist_re, as_group_list=True).set_parse_action(to_md)
+
+
+def zim_to_md(zim_text: str) -> str:
+    r"""
+    Main Zim Wiki to Markdown conversion function.
+
+    >>> zim_to_md("''monospace'' **bold**")
+    '`monospace` **bold**'
+    >>> zim_to_md("super^{script}, sub_{script}")
+    'super^script^, sub~script~'
+    >>> zim_to_md("====== heading 1 ======")
+    '# heading 1'
+    >>> zim_to_md("== heading5 ==")
+    '##### heading5'
+    >>> zim_to_md("'''\nsome code\nblock\n'''")
+    '```\nsome code\nblock\n```'
+    >>> zim_to_md("[ ] unchecked\n[x] not done")
+    '- [ ] unchecked\n- [x] not done'
+    >>> zim_to_md("[ ] u\n    [>] np\n    [*] nd\n[x] nd")
+    '- [ ] u\n    - [ ] np\n    - [x] nd\n- [x] nd'
+    >>> zim_to_md("* lvl1\n\t* lvl2\n\t* lvl2\n* lvl1")
+    '* lvl1\n    * lvl2\n    * lvl2\n* lvl1'
+    """
+    zim_markup = (
+        pp.Literal("'''").set_parse_action(lambda: "```")
+        # text formatting
+        | quote("''", "`")
+        | italic()
+        | subscript()
+        | superscript()
+        #
+        | horizontal_line()
+        | heading()
+        | checklist()
+    )
+
+    # TODO: str.translate() seems to be fastest
+    # https://stackoverflow.com/a/8958372
+    zim_text = zim_text.replace("\t", " " * 4)
+    return zim_markup.transform_string(zim_text)
diff --git a/test/data b/test/data
index ccc4b4ba..0240cf4a 160000
--- a/test/data
+++ b/test/data
@@ -1 +1 @@
-Subproject commit ccc4b4ba0b4c4c841290669634045a2edad25a4b
+Subproject commit 0240cf4a241e007294758a31a8ba6e548e3a9782
diff --git a/test/test_convert.py b/test/test_convert.py
index 82536b3c..1c9e1ee6 100644
--- a/test/test_convert.py
+++ b/test/test_convert.py
@@ -126,6 +126,7 @@ def compare_dirs(dir1: Path, dir2: Path):
             [["tomboy_ng/test_1/gnote"]],
             [["tomboy_ng/test_2/tomboy-ng"]],
             [["zettelkasten/test_1/test_zettelkasten.zkn3"]],
+            [["zim/test_1/notebook"]],
             [["zoho_notebook/test_1/Notebook_14Apr2024_1300_html.zip"]],
         ],
         name_func=name_func,