Skip to content

Commit

Permalink
HTML: split between custom and common filters
Browse files (browse the repository at this point in the history)
...and fix imports (always absolute)
  • Loading branch information
marph91 committed Jan 21, 2025
1 parent 8705dbc commit 381dec5
Show file tree
Hide file tree
Showing 15 changed files with 41 additions and 35 deletions.
2 changes: 1 addition & 1 deletion src/formats/cherrytree.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import common
import converter
import intermediate_format as imf
import markdown_lib
import markdown_lib.common


LOGGER = logging.getLogger("jimmy")
Expand Down
2 changes: 1 addition & 1 deletion src/formats/dynalist.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import common
import converter
import intermediate_format as imf
import markdown_lib
import markdown_lib.common


def handle_markdown_links(body: str, root_folder: Path) -> imf.NoteLinks:
Expand Down
2 changes: 1 addition & 1 deletion src/formats/joplin.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import common
import converter
import intermediate_format as imf
import markdown_lib
import markdown_lib.common


class ItemType(enum.IntEnum):
Expand Down
13 changes: 5 additions & 8 deletions src/formats/nimbus_note.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,11 @@
import base64
from pathlib import Path

from bs4 import BeautifulSoup

import common
import converter
import intermediate_format as imf
import markdown_lib.common
import markdown_lib.html_preprocessing
import markdown_lib.html_filter


class Converter(converter.BaseConverter):
Expand Down Expand Up @@ -57,11 +55,10 @@ def convert_note(self, file_: Path, temp_folder: Path):

# HTML note seems to have the name "note.html" always
note_body_html = (temp_folder_note / "note.html").read_text(encoding="utf-8")

soup = BeautifulSoup(note_body_html, "html.parser")
markdown_lib.html_preprocessing.nimbus_note_streamline_lists(soup)

note_body_markdown = markdown_lib.common.markup_to_markdown(str(soup))
note_body_markdown = markdown_lib.common.markup_to_markdown(
note_body_html,
custom_filter=[markdown_lib.html_filter.nimbus_note_streamline_lists],
)
resources = self.handle_markdown_links(note_body_markdown, temp_folder_note)
note_imf = imf.Note(
title,
Expand Down
8 changes: 3 additions & 5 deletions src/formats/notion.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
from urllib.parse import unquote
import zipfile

from bs4 import BeautifulSoup

import common
import converter
import intermediate_format as imf
Expand Down Expand Up @@ -110,9 +108,9 @@ def convert_note(
# first line is title, second is whitespace
body = "\n".join(item.read_text(encoding="utf-8").split("\n")[2:])
else: # html
soup = BeautifulSoup(body, "html.parser")
markdown_lib.html_preprocessing.notion_streamline_lists(soup)
body = markdown_lib.common.markup_to_markdown(str(soup))
body = markdown_lib.common.markup_to_markdown(
body, custom_filter=[markdown_lib.html_filter.notion_streamline_lists]
)

# find links
resources, note_links = self.handle_markdown_links(body, item)
Expand Down
2 changes: 1 addition & 1 deletion src/formats/obsidian.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import common
import converter
import intermediate_format as imf
import markdown_lib
import markdown_lib.common


class Converter(converter.BaseConverter):
Expand Down
2 changes: 1 addition & 1 deletion src/formats/rednotebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import common
import converter
import intermediate_format as imf
import markdown_lib
import markdown_lib.common


class Converter(converter.BaseConverter):
Expand Down
2 changes: 1 addition & 1 deletion src/formats/simplenote.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import common
import converter
import intermediate_format as imf
import markdown_lib
import markdown_lib.common


class Converter(converter.BaseConverter):
Expand Down
2 changes: 1 addition & 1 deletion src/formats/standard_notes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import converter
import intermediate_format as imf
import markdown_lib
import markdown_lib.common
import common

LOGGER = logging.getLogger("jimmy")
Expand Down
10 changes: 8 additions & 2 deletions src/formats/synology_note_station.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
import common
import converter
import intermediate_format as imf
import markdown_lib
import markdown_lib.common
import markdown_lib.html_filter


@dataclass
Expand Down Expand Up @@ -145,7 +146,12 @@ def convert_note(self, note_id, note_id_title_map):

note_links: imf.NoteLinks = []
if (content_html := note.get("content")) is not None:
content_markdown = markdown_lib.common.markup_to_markdown(content_html)
content_markdown = markdown_lib.common.markup_to_markdown(
content_html,
custom_filter=[
markdown_lib.html_filter.synology_note_station_fix_img_src
],
)
# note title only needed for debug message
body, resources_referenced, note_links = self.handle_markdown_links(
note["title"],
Expand Down
2 changes: 1 addition & 1 deletion src/formats/textbundle.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import common
import converter
import intermediate_format as imf
import markdown_lib
import markdown_lib.common


class Converter(converter.BaseConverter):
Expand Down
2 changes: 1 addition & 1 deletion src/formats/zoho_notebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import common
import converter
import intermediate_format as imf
import markdown_lib
import markdown_lib.common


class Converter(converter.BaseConverter):
Expand Down
25 changes: 15 additions & 10 deletions src/markdown_lib/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from markdown.extensions import Extension
import pypandoc

import markdown_lib.html_preprocessing
import markdown_lib.html_filter


LOGGER = logging.getLogger("jimmy")
Expand Down Expand Up @@ -263,15 +263,17 @@ def get_inline_tags(text: str, start_characters: list[str]) -> list[str]:
# fmt:on


def html_to_markdown(text_html: bytes | str):
def html_to_markdown(text_html: bytes | str, custom_filter: list | None = None):
# some needed preprocessing
soup = BeautifulSoup(text_html, "html.parser")
markdown_lib.html_preprocessing.div_checklists(soup)
markdown_lib.html_preprocessing.highlighting(soup)
markdown_lib.html_preprocessing.iframes_to_links(soup)
markdown_lib.html_preprocessing.streamline_tables(soup)
markdown_lib.html_preprocessing.synology_note_station_fix_img_src(soup)
markdown_lib.html_preprocessing.whitespace_in_math(soup)
markdown_lib.html_filter.div_checklists(soup)
markdown_lib.html_filter.highlighting(soup)
markdown_lib.html_filter.iframes_to_links(soup)
markdown_lib.html_filter.streamline_tables(soup)
markdown_lib.html_filter.whitespace_in_math(soup)
if custom_filter is not None:
for filter_ in custom_filter:
filter_(soup)
text_html_filtered = str(soup)

# writer: json ast -> markdown
Expand All @@ -292,7 +294,10 @@ def html_to_markdown(text_html: bytes | str):


def markup_to_markdown(
text: bytes | str, format_: str = "html", resource_folder: Path = Path("tmp_media")
text: bytes | str,
format_: str = "html",
resource_folder: Path = Path("tmp_media"),
custom_filter: list | None = None,
) -> str:
# Route everything through this function to get a single path of truth.
if format_.startswith("html"):
Expand All @@ -314,7 +319,7 @@ def markup_to_markdown(

# HTML filter: HTML -> filter -> HTML
# writer: HTML -> Markdown
return html_to_markdown(text_html)
return html_to_markdown(text_html, custom_filter)


# Problem: "//" is part of many URI (between scheme and host).
Expand Down
2 changes: 1 addition & 1 deletion src/markdown_lib/evernote.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from cryptography.hazmat.primitives import padding

import markdown_lib
import markdown_lib.common


# TODO: simplify
Expand Down
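File renamed without changes (file header missing from this view; judging by the import changes above, presumably src/markdown_lib/html_preprocessing.py → src/markdown_lib/html_filter.py).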

0 comments on commit 381dec5

Please sign in to comment.