Skip to content

Commit

Permalink
add support for zim
Browse files Browse the repository at this point in the history
  • Loading branch information
marph91 committed Oct 6, 2024
1 parent e39b513 commit 351bfae
Show file tree
Hide file tree
Showing 12 changed files with 285 additions and 33 deletions.
1 change: 0 additions & 1 deletion docs/contributing/more_note_apps.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,5 +93,4 @@ https://github.com/LucasMatuszewski/snb2md-recursive
| Wunderlist | [script](https://github.com/eschlot/Wunderlist2Joplin) | dead? |
| [Xiaomi Notes](https://i.mi.com/note/h5) | | account needed |
| [XWiki](https://www.xwiki.org/) | [doc](https://www.xwiki.org/xwiki/bin/view/Documentation/UserGuide/Features/Exports) | |
| [Zim](https://zim-wiki.org/index.html) | - [doc](https://zim-wiki.org/manual/Help/Export.html) (Markdown) <br>- [script](https://gist.github.com/reagle/7418f54fb6e40fe8d925e1c3f5325076) | |
| [Zotero](https://www.zotero.org/) | [doc](https://www.zotero.org/support/kb/exporting) | |
25 changes: 25 additions & 0 deletions docs/formats/zim.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
This page describes how to convert notes from Zim Wiki to Markdown.

## General Information

- [Website](https://zim-wiki.org/)
- Typical extension: Folder with `.txt` files

## Instructions

1. [Install jimmy](../index.md#installation)
2. Convert to Markdown. Example: `jimmy-cli-linux zim/folder --format zim`
3. [Import to your app](../import_instructions.md)

## Import Structure

Zim does a good job of [exporting to Markdown](https://zim-wiki.org/manual/Help/Export.html). If the built-in export is sufficient for you, you don't need to use Jimmy.

Jimmy doesn't use pandoc for conversion and applies some additional tweaks:

- Consistently use ATX style headings (starting with `#`).
- Consistently use spaces instead of tabs.
- Page title and creation date are removed from the note body. Instead, the title is stored in the filename and the creation date in the metadata. The metadata can be included as front matter.
- Convert Zim checklists to Markdown checklists (`- [ ]`) instead of Markdown lists with signs (`- ☐`). The checklist states are converted as described below:
- Done and not done are converted to `- [x]`.
- All other states are converted to `- [ ]`.
19 changes: 10 additions & 9 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,16 @@ Alternative installation options:

```mermaid
flowchart LR
A[App 1] -->|Backup| D
B[App 2] -->|Export| D
C[Filesystem] --> D
D(ZIP archive/JSON/folder) --> E
E{jimmy} --> F(Markdown + Frontmatter)
F -->|Import| G[Joplin]
F -->|Import| H[Obsidian]
F --> I[...]
F --> J[Editor, e. g. VSCode]
A[App 1] -->|Backup| M
B[App 2] -->|Export| M
C[...] --> M
D[Filesystem] --> M
M(ZIP archive/JSON/Folder) --> N
N{jimmy} --> O(Markdown + Frontmatter)
O -->|Import| P[Joplin]
O -->|Import| Q[Obsidian]
O --> R[...]
O --> S[Editor, e. g. VSCode]
```

1. Export/backup notes from your note application
Expand Down
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ nav:
- vCard: formats/vcard.md
# - xit: formats/xit.md
- Zettelkasten: formats/zettelkasten.md
- Zim: formats/zim.md
- Zoho Notebook: formats/zoho_notebook.md
- Import Instructions: import_instructions.md
- Additional features:
Expand Down
105 changes: 105 additions & 0 deletions src/formats/zim.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
"""Convert Zim Wiki notes to the intermediate format."""

import datetime as dt
from pathlib import Path
import re

import common
import converter
import intermediate_format as imf
import markdown_lib.common
from markdown_lib.zim import zim_to_md


# Matches "{{...}}" spans (used for images in the converter below).
# Group 1 is the complete match including the braces, group 2 is the
# (non-greedy) text between the braces.
ZIM_IMAGE_REGEX = re.compile(r"(\{\{(.*?)\}\})")


class Converter(converter.BaseConverter):
    """Convert a Zim Wiki notebook (a folder of ``.txt`` pages).

    Folders are mapped to notebooks and ``.txt`` files to notes.
    """

    accept_folder = True

    def handle_zim_links(self, body: str) -> tuple[list, list]:
        """Collect file resources and note links from the wikilinks in ``body``.

        Args:
            body: Note text that is scanned for ``[[...]]`` links.

        Returns:
            A tuple ``(resources, note_links)``.
        """
        # https://zim-wiki.org/manual/Help/Links.html
        # https://zim-wiki.org/manual/Help/Wiki_Syntax.html
        note_links = []
        resources = []
        for _, url, description in markdown_lib.common.get_wikilink_links(body):
            # NOTE(review): the original text is rebuilt from the URL only.
            # Presumably this doesn't match links that carry a description.
            # TODO: confirm against get_wikilink_links().
            original_text = f"[[{url}]]"
            if "/" in url:
                # resource
                # Links containing a '/' are considered links to external files
                resource_path = common.find_file_recursively(self.root_path, url)
                if resource_path is None:
                    continue
                resources.append(
                    imf.Resource(resource_path, original_text, description or url)
                )
            elif "?" in url:
                # Links that contain a '?' are interwiki links
                pass  # interwiki links can't be resolved
            elif url.startswith("#"):
                # Links that start with a '#' are resolved as links
                # within the page to a heading or an object
                pass  # they don't need to be resolved
            else:
                # Ignore other directives for now.
                # TODO: Find a way to map them. Right now we only map by
                # matching the original_id.
                original_id = url.split(":")[-1].lstrip("+")
                note_links.append(
                    imf.NoteLink(original_text, original_id, description or original_id)
                )
        return resources, note_links

    def handle_zim_images(self, body: str) -> list[imf.Resource]:
        """Create a resource for each ``{{...}}`` span found in ``body``."""
        images = []
        for original_text, image_link in ZIM_IMAGE_REGEX.findall(body):
            image_link = Path(image_link)
            images.append(imf.Resource(image_link, original_text, image_link.name))
        return images

    @staticmethod
    def _split_page(text: str) -> tuple[str, str]:
        """Split the raw text of a page into ``(metadata, body)``.

        The text is split at the first two blank lines. The first part is
        the metadata header, the second part is dropped (the converter
        stores title and creation date outside of the body) and the rest is
        the body. Pages with fewer than three parts, for example pages that
        only consist of a header, yield an empty body instead of failing.
        """
        parts = text.split("\n\n", maxsplit=2)
        body = parts[2] if len(parts) == 3 else ""
        return parts[0], body

    def convert_folder(self, folder: Path, parent: imf.Notebook):
        """Recursively convert ``folder`` and attach the results to ``parent``.

        Args:
            folder: Folder to convert. Subfolders become child notebooks.
            parent: Notebook that receives the converted notes and notebooks.
        """
        for item in folder.iterdir():
            if item.is_dir():
                # notebook
                new_parent = imf.Notebook(item.name)
                self.convert_folder(item, new_parent)
                parent.child_notebooks.append(new_parent)
                continue
            if item.name == "notebook.zim" or item.suffix.lower() != ".txt":
                continue

            # note
            title = item.stem.replace("_", " ")  # underscores seem to be replaced
            self.logger.debug(f'Converting note "{title}"')

            imf_note = imf.Note(
                title, source_application=self.format, original_id=title
            )

            metadata, body = self._split_page(item.read_text(encoding="utf-8"))
            for line in metadata.split("\n"):
                # partition() tolerates lines without a "key: value" shape
                key, separator, value = line.partition(": ")
                if separator and key == "Creation-Date":
                    imf_note.created = dt.datetime.fromisoformat(value)

            imf_note.body = zim_to_md(body)

            resources, note_links = self.handle_zim_links(imf_note.body)
            imf_note.resources = resources
            imf_note.note_links = note_links

            imf_note.resources.extend(self.handle_zim_images(imf_note.body))

            # tags: https://zim-wiki.org/manual/Help/Tags.html
            # TODO: exclude invalid characters
            imf_note.tags = [
                imf.Tag(tag) for tag in markdown_lib.common.get_inline_tags(body, ["@"])
            ]

            parent.child_notes.append(imf_note)

    def convert(self, file_or_folder: Path):
        """Convert the notebook folder ``file_or_folder`` into the root notebook."""
        # The root path is needed to resolve linked files later.
        self.root_path = file_or_folder
        self.convert_folder(file_or_folder, self.root_notebook)
4 changes: 3 additions & 1 deletion src/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,9 @@ def update_note_links(self, note: imf.Note):
if new_path is None:
LOGGER.debug(
f'Note "{note.title}": '
f'could not find linked note: "{note_link.original_text}"'
f'could not find linked note: "{note_link.original_text}"',
# prevent titles with [[]] syntax from being handled as markup
extra={"markup": None},
)
continue

Expand Down
2 changes: 2 additions & 0 deletions src/intermediate_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ class Resource:
path: Path | None = None

def __post_init__(self):
# resolve the user directory to prevent issues with puremagic
self.filename = self.filename.expanduser()
# We can't simply match by extension, because sometimes the files/images
# are stored as binary blob without extension.
self.is_image = common.is_image(self.filename)
Expand Down
23 changes: 23 additions & 0 deletions src/markdown_lib/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,14 +196,37 @@ def get_inline_tags(text: str, start_characters: list[str]) -> list[str]:
# markdown output formats:
# https://pandoc.org/chunkedhtml-demo/8.22-markdown-variants.html
# Don't use "commonmark_x". There would be too much noise.
# fmt: off
PANDOC_OUTPUT_FORMAT = (
"markdown_strict"
"+pipe_tables"
"+backtick_code_blocks"
"+task_lists"
"-raw_html"
)
# fmt:on


def markup_to_markdown(text: str, format_: str = "html") -> str:
    """Convert markup text to Markdown using pandoc.

    ``text`` is passed to ``pypandoc.convert_text`` with ``format_`` as input
    format and ``PANDOC_OUTPUT_FORMAT`` as output format.
    """
    return pypandoc.convert_text(text, PANDOC_OUTPUT_FORMAT, format=format_)


# "//" is also part of many URIs (between scheme and host). These occurrences
# need to be excluded to prevent unwanted conversions.
# https://en.wikipedia.org/wiki/List_of_URI_schemes
schemes = [
    "file",
    "ftp",
    "http",
    "https",
    "imap",
    "irc",
    "udp",
    "tcp",
    "ntp",
    "app",
    "s3",
]
# One negative lookbehind per scheme: "//" must not directly follow "<scheme>:".
NEG_LOOKBEHINDS = "".join(map("(?<!{}:)".format, schemes))
_DOUBLE_SLASH = NEG_LOOKBEHINDS + r"\/\/"
# Group 1 is the text between the opening and the closing "//".
double_slash_re = re.compile(_DOUBLE_SLASH + r"(.*?)" + _DOUBLE_SLASH)
# A line that consists only of three or more dashes.
horizontal_line_re = re.compile(r"^-{3,}$", re.MULTILINE)
27 changes: 6 additions & 21 deletions src/markdown_lib/tiddlywiki.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,28 +19,9 @@
# - pp.ParserElement.enable_packrat() -> seems to be even slower
# - use regex instead of chaining
multiline_quote_re = re.compile(r"<<<\n([\S\s]*?)\n<<<(.*)")
horizontal_line_re = re.compile(r"^-{3,}$", re.MULTILINE)
link_re = re.compile(r"\[(ext|img)?.*\[(.*)\]\]")
list_re = re.compile(r"^([*#>]+) ", re.MULTILINE)
table_row_re = re.compile(r"\|(.*?)\|([kchf])?\n")
# Problem: "//" is part of many URI (between scheme and host).
# We need to exclude them to prevent unwanted conversions.
# https://en.wikipedia.org/wiki/List_of_URI_schemes
schemes = [
"file",
"ftp",
"http",
"https",
"imap",
"irc",
"udp",
"tcp",
"ntp",
"app",
"s3",
]
NEG_LOOKBEHINDS = "".join(f"(?<!{scheme}:)" for scheme in schemes)
italic_re = re.compile(rf"{NEG_LOOKBEHINDS}\/\/(.*?){NEG_LOOKBEHINDS}\/\/")


def dash():
Expand Down Expand Up @@ -70,11 +51,15 @@ def italic():
def to_md(_, t): # noqa
return "*" + t[0][0] + "*"

return pp.Regex(italic_re, as_group_list=True).set_parse_action(to_md)
return pp.Regex(
markdown_lib.common.double_slash_re, as_group_list=True
).set_parse_action(to_md)


def horizontal_line():
return pp.Regex(horizontal_line_re).set_parse_action(lambda: "---")
return pp.Regex(markdown_lib.common.horizontal_line_re).set_parse_action(
lambda: "---"
)


def link():
Expand Down
108 changes: 108 additions & 0 deletions src/markdown_lib/zim.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""Convert Zim Wiki to Markdown."""

import re

import pyparsing as pp

import markdown_lib.common


# Prevent spaces, tabs and newlines from being stripped.
pp.ParserElement.set_default_whitespace_chars("")


# Zim headings occupy a complete line, from "====== Head 1 ======" down to
# "== Head 5 ==". The pattern is anchored to the line and requires at least
# two opening "=", so inline text like "a = b = c" isn't mistaken for a heading.
# Group 1: opening markers (their count determines the level), group 2: title.
heading_re = re.compile(r"^(={2,6}) (.*?) ={1,6}[ \t]*$", re.MULTILINE)
# Group 1: leading indentation, group 2: state character of the checkbox.
checklist_re = re.compile(r"^( *)\[([ <>*x])\] ", re.MULTILINE)


def quote(source_tag, target_tag):
    """Build a parser for text enclosed in identical start and end tags.

    The enclosed text is emitted wrapped in ``target_tag`` on both sides.
    """

    def wrap(_, tokens):  # noqa
        inner = tokens[0]
        return f"{target_tag}{inner}{target_tag}"

    quoted = pp.QuotedString(source_tag)
    return quoted.set_parse_action(wrap)


def subscript():
    """Build a parser that converts ``_{text}`` to ``~text~``."""

    def wrap(_, tokens):  # noqa
        inner = tokens[0]
        return f"~{inner}~"

    element = pp.QuotedString("_{", endQuoteChar="}")
    return element.set_parse_action(wrap)


def superscript():
    """Build a parser that converts ``^{text}`` to ``^text^``."""

    def wrap(_, tokens):  # noqa
        inner = tokens[0]
        return f"^{inner}^"

    element = pp.QuotedString("^{", endQuoteChar="}")
    return element.set_parse_action(wrap)


def italic():
    """Build a parser that converts ``//text//`` to ``*text*``.

    Matching is done by ``markdown_lib.common.double_slash_re``.
    """

    def wrap(_, tokens):  # noqa
        groups = tokens[0]
        return f"*{groups[0]}*"

    element = pp.Regex(markdown_lib.common.double_slash_re, as_group_list=True)
    return element.set_parse_action(wrap)


def horizontal_line():
    """Build a parser that replaces a dash-only line with ``---``.

    The replacement is surrounded by newlines.
    """

    def to_md():
        return "\n---\n"

    element = pp.Regex(markdown_lib.common.horizontal_line_re)
    return element.set_parse_action(to_md)


def heading():
    """Build a parser that converts Zim headings to ATX headings.

    The number of ``#`` is seven minus the number of opening ``=``.
    """

    def to_md(_, tokens):  # noqa
        groups = tokens[0]
        markers = groups[0]
        text = groups[1]
        level = 7 - len(markers)
        return f"{'#' * level} {text}"

    element = pp.Regex(heading_re, as_group_list=True)
    return element.set_parse_action(to_md)


def checklist():
    """Build a parser that converts Zim checkboxes to Markdown task list items.

    The states ``*`` and ``x`` become ``- [x]``, all other states ``- [ ]``.
    The leading indentation is kept.
    """

    def to_md(_, tokens):  # noqa
        groups = tokens[0]
        indentation = groups[0]
        state = groups[1]
        if state in ("*", "x"):
            list_char = "x"
        else:
            list_char = " "
        return f"{indentation}- [{list_char}] "

    element = pp.Regex(checklist_re, as_group_list=True)
    return element.set_parse_action(to_md)


def zim_to_md(zim_text: str) -> str:
    r"""
    Convert Zim Wiki markup to Markdown.

    Tabs are replaced by four spaces first. Then all known Zim elements are
    replaced by their Markdown counterparts; any other text is kept as is.

    >>> zim_to_md("''monospace'' **bold**")
    '`monospace` **bold**'
    >>> zim_to_md("super^{script}, sub_{script}")
    'super^script^, sub~script~'
    >>> zim_to_md("====== heading 1 ======")
    '# heading 1'
    >>> zim_to_md("== heading5 ==")
    '##### heading5'
    >>> zim_to_md("'''\nsome code\nblock\n'''")
    '```\nsome code\nblock\n```'
    >>> zim_to_md("[ ] unchecked\n[x] not done")
    '- [ ] unchecked\n- [x] not done'
    >>> zim_to_md("[ ] u\n    [>] np\n    [*] nd\n[x] nd")
    '- [ ] u\n    - [ ] np\n    - [x] nd\n- [x] nd'
    >>> zim_to_md("* lvl1\n\t* lvl2\n\t* lvl2\n* lvl1")
    '* lvl1\n    * lvl2\n    * lvl2\n* lvl1'
    """
    # TODO: str.translate() seems to be fastest
    # https://stackoverflow.com/a/8958372
    text_without_tabs = zim_text.replace("\t", " " * 4)

    code_fence = pp.Literal("'''").set_parse_action(lambda: "```")
    # The order matters: the first matching alternative wins. The code fence
    # has to be tried before the monospace quotes, since "'''" starts with "''".
    zim_markup = pp.MatchFirst(
        [
            code_fence,
            # text formatting
            quote("''", "`"),
            italic(),
            subscript(),
            superscript(),
            # block elements
            horizontal_line(),
            heading(),
            checklist(),
        ]
    )
    return zim_markup.transform_string(text_without_tabs)
1 change: 1 addition & 0 deletions test/test_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ def compare_dirs(dir1: Path, dir2: Path):
[["tomboy_ng/test_1/gnote"]],
[["tomboy_ng/test_2/tomboy-ng"]],
[["zettelkasten/test_1/test_zettelkasten.zkn3"]],
[["zim/test_1/notebook"]],
[["zoho_notebook/test_1/Notebook_14Apr2024_1300_html.zip"]],
],
name_func=name_func,
Expand Down

0 comments on commit 351bfae

Please sign in to comment.