From c5b508487c394a92d8bc3a04c512e7a231c38b1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20D=C3=B6rfelt?= Date: Sun, 20 Oct 2024 13:42:56 +0200 Subject: [PATCH] nimbusnote: improve list conversion (indentation; checklist, numbered and unnumbered lists) --- src/formats/nimbus_note.py | 46 +++++++++++++++++++++++++++++------- src/formats/zoho_notebook.py | 10 ++++---- test/data | 2 +- 3 files changed, 42 insertions(+), 16 deletions(-) diff --git a/src/formats/nimbus_note.py b/src/formats/nimbus_note.py index d271ad58..8931cfdc 100644 --- a/src/formats/nimbus_note.py +++ b/src/formats/nimbus_note.py @@ -11,7 +11,7 @@ import markdown_lib.common -def clean_tables(soup): +def streamline_tables(soup: BeautifulSoup): for table in soup.find_all("table"): tags_to_remove = ["div", "span"] for tag in tags_to_remove: @@ -19,15 +19,43 @@ def clean_tables(soup): element.unwrap() -def assign_lists(_soup): - # TODO: +def streamline_lists(soup: BeautifulSoup): # - all lists are unnumbered lists (ul) # - type is in the class attr (list-item-number, -bullet, -checkbox) # - indentation is in the class attr (indent-0) - - # for task_list in soup.find_all("ul", class_="checklist"): - # ... - pass + for list_ in soup.find_all("ul"): + current_indent = 0 + current_list = list_ + for item in list_.find_all("li"): + item_type = [ + i[len("list-item-") :] + for i in item["class"] + if i.startswith("list-item-") + ][0] + list_type = {"checkbox": "ul", "bullet": "ul", "number": "ol"}[item_type] + if item_type == "checkbox": + item.insert(0, soup.new_tag("input", type="checkbox")) + + indent = [i for i in item["class"] if i.startswith("indent-")][0] + indent_int = int(indent[len("indent-") :]) # 1 digit number always + if indent_int == 0: + # would be sufficient to do only one time + current_list.name = list_type + if item_type == "checkbox" and "checklist" not in current_list["class"]: + current_list["class"] = ["checklist"] # drop the other classes + if indent_int > current_indent: + # new child list + new_list = soup.new_tag(list_type) + current_list.append(new_list) + current_list = new_list + current_indent = indent_int + elif indent_int < current_indent: + # find parent list at the corresponding level + for _ in range(current_indent - indent_int): + current_list = current_list.parent + + item.attrs = {} # remove all attributes + current_list.append(item) class Converter(converter.BaseConverter): @@ -81,8 +109,8 @@ def convert(self, file_or_folder: Path): ) soup = BeautifulSoup(note_body_html, "html.parser") - clean_tables(soup) - assign_lists(soup) + streamline_tables(soup) + streamline_lists(soup) note_body_markdown = markdown_lib.common.markup_to_markdown(str(soup)) resources = self.handle_markdown_links(note_body_markdown, temp_folder_note) diff --git a/src/formats/zoho_notebook.py b/src/formats/zoho_notebook.py index 57ca43a7..cfc9c979 100644 --- a/src/formats/zoho_notebook.py +++ b/src/formats/zoho_notebook.py @@ -12,7 +12,7 @@ import markdown_lib -def clean_tables(soup): +def streamline_tables(soup: BeautifulSoup): for table in soup.find_all("table"): for row in table.find_all("tr"): for td in row.find_all("td"): @@ -22,9 +22,7 @@ def clean_tables(soup): td.append(text_only) -def clean_task_lists(soup): - # TODO: Not sure why the cleaned task lists still don't work. - # It works online. Maybe caused by an old pandoc version. +def streamline_checklists(soup: BeautifulSoup): for task_list in soup.find_all("div", class_="checklist"): task_list.name = "ul" # remove the spans @@ -132,8 +130,8 @@ def convert_note(self, file_: Path): # convert the note body to Markdown if soup.body is not None: - clean_tables(soup) - clean_task_lists(soup) + streamline_tables(soup) + streamline_checklists(soup) body = markdown_lib.common.markup_to_markdown(str(soup)) # resources and internal links diff --git a/test/data b/test/data index 493ef460..e25dbd44 160000 --- a/test/data +++ b/test/data @@ -1 +1 @@ -Subproject commit 493ef460f4438b190e70d8fcb78d41ba843ccc94 +Subproject commit e25dbd447fb611d0375f05231154fdfb3344a1aa